Filippo Giunchedi has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/351793 )

Change subject: thumbstats: add Hive export
......................................................................

thumbstats: add Hive export

Bug: T162796
Change-Id: Ia6cd8e66ff7da58ca5e2fe164a220e9d40b5391b
---
M thumbstats/swift-thumb-stats
1 file changed, 40 insertions(+), 5 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/software refs/changes/93/351793/1

diff --git a/thumbstats/swift-thumb-stats b/thumbstats/swift-thumb-stats
index 960fdef..67b21aa 100755
--- a/thumbstats/swift-thumb-stats
+++ b/thumbstats/swift-thumb-stats
@@ -34,6 +34,34 @@
         pass
 
 
+class HiveExport(Filter):
+    """Print to stdout a dump of thumbnails found, ready for import into Hive
+    for further analysis.
+
+    The Hive statements below will create a suitable table and import the dump.
+    Note that the dump needs to live on the same server as the hive server is
+    running on. The ORC conversion step is needed for efficient storage and
+    querying by Hive.
+
+    CREATE TABLE thumbstats_raw (name string, pixels int, container string, path string, bytes int, last_modified timestamp, content_type string);
+    CREATE TABLE thumbstats (name string, pixels int, container string, path string, bytes int, last_modified timestamp, content_type string) STORED AS ORC tblproperties ("orc.compress" = "SNAPPY");
+    LOAD DATA LOCAL INPATH "<dumpfile>" OVERWRITE INTO TABLE thumbstats_raw;
+    INSERT OVERWRITE thumbstats SELECT * FROM thumbstats_raw;
+    """
+
+    def process(self, thumb):
+        # Hive < 1.2.0 requires a fixed timestamp format on load, see HIVE-9298
+        thumb.last_modified = thumb.last_modified.replace('T', ' ')
+        thumb.last_modified += '000'
+        export_fields = (thumb.name, thumb.thumbsize, thumb.container_name,
+                thumb.filename, thumb.bytes, thumb.last_modified,
+                thumb.content_type)
+        print "\x01".join([unicode(x).encode('utf8') for x in export_fields])
+
+    def result(self):
+        return {}
+
+
 class BytesPerSize(Filter):
     """Size vs bytes breakdown."""
     _bytes = {}
@@ -94,7 +122,7 @@
         _, listing = connection.get_container(name, limit=limit, marker=marker)
 
 
-def iter_thumbs(container):
+def iter_thumbs(container, container_name):
     """Iterate over the container contents and yield Thumb objects."""
 
     for thumb in container:
@@ -102,6 +130,7 @@
         if not m:
             continue
         t = Thumb()
+        t.container_name = container_name
         t.name = m.group('name')
         t.thumbsize = m.group('size')
         t.filename = thumb['name']
@@ -116,7 +145,7 @@
     """Iterate over the container thumbs and pass items to each filter."""
 
     container_name = container['name']
-    thumbs = iter_thumbs(iter_container(connection, container_name))
+    thumbs = iter_thumbs(iter_container(connection, container_name), container_name)
     start = datetime.datetime.utcnow()
     for i, thumb in enumerate(thumbs):
         thumb.container_name = container_name
@@ -166,16 +195,19 @@
         description="Print swift thumb statistics")
     parser.add_argument(
         '-A', '--auth', dest='auth', default=os.environ.get('ST_AUTH', None),
-        help='URL for obtaining an auth token (ST_AUTH)')
+        help='URL for obtaining an auth token')
     parser.add_argument(
         '-U', '--user', dest='user', default=os.environ.get('ST_USER', None),
-        help='User name for obtaining an auth token (ST_USER)')
+        help='User name for obtaining an auth token')
     parser.add_argument(
         '-K', '--key', dest='key', default=os.environ.get('ST_KEY', None),
-        help='Key for obtaining an auth token (ST_KEY)')
+        help='Key for obtaining an auth token')
     parser.add_argument(
         '-t', '--threads', dest='threads', default=3, type=int,
         help='How many threads to use (%(default)s)')
+    parser.add_argument(
+        '--hive_export', dest='hive_export', default=False, action='store_true',
+        help='Export thumbnail data for Hive processing')
     args = parser.parse_args()
 
     if None in (args.auth, args.user, args.key):
@@ -188,6 +220,9 @@
     filters = [BytesPerSize(), CountPerSize(), BytesByMonth()]
     threads = []
 
+    if args.hive_export:
+        filters = [HiveExport()]
+
     for i in range(args.threads):
         thread_connection = swiftclient.Connection(
             args.auth, args.user, args.key, retry_on_ratelimit=True)

-- 
To view, visit https://gerrit.wikimedia.org/r/351793
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia6cd8e66ff7da58ca5e2fe164a220e9d40b5391b
Gerrit-PatchSet: 1
Gerrit-Project: operations/software
Gerrit-Branch: master
Gerrit-Owner: Filippo Giunchedi <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to