This is an automated email from the ASF dual-hosted git repository.
okislal pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/madlib-site.git
The following commit(s) were added to refs/heads/asf-site by this push:
new 096a07f Image loader python module, and demo notebook
096a07f is described below
commit 096a07fc6666c6f0803e3c97ded70acddd4c0b19
Author: Domino Valdano <[email protected]>
AuthorDate: Mon Jun 17 13:13:39 2019 -0700
Image loader python module, and demo notebook
madlib_image_loader.py:
A python module which can be imported and used to
load image datasets in numpy array format into postgres
or greenplum database as a table, for use with Madlib's
deep learning module. Either import from current directory,
or copy it to somewhere in the python path.
Madlib Image Loader Demo.ipynb:
A Jupyter notebook that demonstrates how to use the module.
Closes #14
Co-authored-by: Nandish Jayaram <[email protected]>
---
community-artifacts/Madlib Image Loader Demo.ipynb | 464 +++++++++++++++++++++
community-artifacts/madlib_image_loader.py | 418 +++++++++++++++++++
2 files changed, 882 insertions(+)
diff --git a/community-artifacts/Madlib Image Loader Demo.ipynb
b/community-artifacts/Madlib Image Loader Demo.ipynb
new file mode 100644
index 0000000..07570f6
--- /dev/null
+++ b/community-artifacts/Madlib Image Loader Demo.ipynb
@@ -0,0 +1,464 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Madlib Image Loader requires psycopg2. If you don't already have it
installed, run:\n",
+ "\n",
+ "`pip install psycopg2-binary`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Using TensorFlow backend.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import sys\n",
+ "import os\n",
+ "from keras.datasets import cifar10, cifar100, mnist, fashion_mnist, imdb,
reuters"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Add community-artifacts to PYTHON_PATH\n",
+ " # TIP: You can skip this cell if working directory of notebook is
community-artifacts\n",
+ "\n",
+ "home = %env HOME\n",
+ " # TIP: Change home,'workspace' to wherever you have cloned
madlib-site repo\n",
+ "madlib_site_dir =
os.path.join(home,'workspace','madlib-site','community-artifacts')\n",
+ "sys.path.append(madlib_site_dir)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import image loader module\n",
+ "from madlib_image_loader import ImageLoader, DbCredentials"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Specify database credentials, for connecting to db\n",
+ "db_creds = DbCredentials(db_name='madlib',\n",
+ " user='pivotal',\n",
+ " host='localhost',\n",
+ " port='15432',\n",
+ " password='')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load dataset into np array\n",
+ "train_data, _ = cifar10.load_data()\n",
+ "data_x, data_y = train_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize ImageLoader (increase num_workers to run faster)\n",
+ "iloader = ImageLoader(num_workers=5, db_creds=db_creds)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "MainProcess: Connected to madlib db.\n",
+ "Appending to table cifar_10_test in madlib db\n",
+ "Spawning 5 workers...\n",
+ "Initializing PoolWorker-1 [pid 240]\n",
+ "PoolWorker-1: Created temporary directory PoolWorker-1\n",
+ "Initializing PoolWorker-2 [pid 241]\n",
+ "PoolWorker-2: Created temporary directory PoolWorker-2\n",
+ "Initializing PoolWorker-3 [pid 242]\n",
+ "PoolWorker-3: Created temporary directory PoolWorker-3\n",
+ "Initializing PoolWorker-4 [pid 243]\n",
+ "PoolWorker-4: Created temporary directory PoolWorker-4\n",
+ "Initializing PoolWorker-5 [pid 245]\n",
+ "PoolWorker-1: Connected to madlib db.\n",
+ "PoolWorker-5: Created temporary directory PoolWorker-5\n",
+ "PoolWorker-2: Connected to madlib db.\n",
+ "PoolWorker-3: Connected to madlib db.\n",
+ "PoolWorker-4: Connected to madlib db.\n",
+ "PoolWorker-5: Connected to madlib db.\n",
+ "PoolWorker-1: Wrote 1000 images to
/tmp/madlib_KdLWwZ322f/cifar_10_test0000.tmp\n",
+ "PoolWorker-2: Wrote 1000 images to
/tmp/madlib_xlKP6JhnfV/cifar_10_test0000.tmp\n",
+ "PoolWorker-3: Wrote 1000 images to
/tmp/madlib_kfSWAjQUxH/cifar_10_test0000.tmp\n",
+ "PoolWorker-4: Wrote 1000 images to
/tmp/madlib_hmXBkZ2Rd5/cifar_10_test0000.tmp\n",
+ "PoolWorker-5: Wrote 1000 images to
/tmp/madlib_4v2Q1jvkZs/cifar_10_test0000.tmp\n",
+ "PoolWorker-1: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-2: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-4: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-3: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-5: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-1: Wrote 1000 images to
/tmp/madlib_KdLWwZ322f/cifar_10_test0001.tmp\n",
+ "PoolWorker-4: Wrote 1000 images to
/tmp/madlib_hmXBkZ2Rd5/cifar_10_test0001.tmp\n",
+ "PoolWorker-2: Wrote 1000 images to
/tmp/madlib_xlKP6JhnfV/cifar_10_test0001.tmp\n",
+ "PoolWorker-3: Wrote 1000 images to
/tmp/madlib_kfSWAjQUxH/cifar_10_test0001.tmp\n",
+ "PoolWorker-5: Wrote 1000 images to
/tmp/madlib_4v2Q1jvkZs/cifar_10_test0001.tmp\n",
+ "PoolWorker-1: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-3: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-2: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-4: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-5: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-3: Wrote 1000 images to
/tmp/madlib_kfSWAjQUxH/cifar_10_test0002.tmp\n",
+ "PoolWorker-1: Wrote 1000 images to
/tmp/madlib_KdLWwZ322f/cifar_10_test0002.tmp\n",
+ "PoolWorker-4: Wrote 1000 images to
/tmp/madlib_hmXBkZ2Rd5/cifar_10_test0002.tmp\n",
+ "PoolWorker-2: Wrote 1000 images to
/tmp/madlib_xlKP6JhnfV/cifar_10_test0002.tmp\n",
+ "PoolWorker-5: Wrote 1000 images to
/tmp/madlib_4v2Q1jvkZs/cifar_10_test0002.tmp\n",
+ "PoolWorker-1: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-4: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-3: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-5: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-2: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-1: Wrote 1000 images to
/tmp/madlib_KdLWwZ322f/cifar_10_test0003.tmp\n",
+ "PoolWorker-4: Wrote 1000 images to
/tmp/madlib_hmXBkZ2Rd5/cifar_10_test0003.tmp\n",
+ "PoolWorker-3: Wrote 1000 images to
/tmp/madlib_kfSWAjQUxH/cifar_10_test0003.tmp\n",
+ "PoolWorker-5: Wrote 1000 images to
/tmp/madlib_4v2Q1jvkZs/cifar_10_test0003.tmp\n",
+ "PoolWorker-2: Wrote 1000 images to
/tmp/madlib_xlKP6JhnfV/cifar_10_test0003.tmp\n",
+ "PoolWorker-1: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-4: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-3: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-2: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-5: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-1: Wrote 1000 images to
/tmp/madlib_KdLWwZ322f/cifar_10_test0004.tmp\n",
+ "PoolWorker-3: Wrote 1000 images to
/tmp/madlib_kfSWAjQUxH/cifar_10_test0004.tmp\n",
+ "PoolWorker-5: Wrote 1000 images to
/tmp/madlib_4v2Q1jvkZs/cifar_10_test0004.tmp\n",
+ "PoolWorker-4: Wrote 1000 images to
/tmp/madlib_hmXBkZ2Rd5/cifar_10_test0004.tmp\n",
+ "PoolWorker-2: Wrote 1000 images to
/tmp/madlib_xlKP6JhnfV/cifar_10_test0004.tmp\n",
+ "PoolWorker-1: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-4: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-5: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-2: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-3: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-5: Wrote 1000 images to
/tmp/madlib_4v2Q1jvkZs/cifar_10_test0005.tmp\n",
+ "PoolWorker-1: Wrote 1000 images to
/tmp/madlib_KdLWwZ322f/cifar_10_test0005.tmp\n",
+ "PoolWorker-2: Wrote 1000 images to
/tmp/madlib_xlKP6JhnfV/cifar_10_test0005.tmp\n",
+ "PoolWorker-4: Wrote 1000 images to
/tmp/madlib_hmXBkZ2Rd5/cifar_10_test0005.tmp\n",
+ "PoolWorker-3: Wrote 1000 images to
/tmp/madlib_kfSWAjQUxH/cifar_10_test0005.tmp\n",
+ "PoolWorker-2: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-3: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-5: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-1: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-4: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-2: Wrote 1000 images to
/tmp/madlib_xlKP6JhnfV/cifar_10_test0006.tmp\n",
+ "PoolWorker-3: Wrote 1000 images to
/tmp/madlib_kfSWAjQUxH/cifar_10_test0006.tmp\n",
+ "PoolWorker-5: Wrote 1000 images to
/tmp/madlib_4v2Q1jvkZs/cifar_10_test0006.tmp\n",
+ "PoolWorker-1: Wrote 1000 images to
/tmp/madlib_KdLWwZ322f/cifar_10_test0006.tmp\n",
+ "PoolWorker-4: Wrote 1000 images to
/tmp/madlib_hmXBkZ2Rd5/cifar_10_test0006.tmp\n",
+ "PoolWorker-2: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-3: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-5: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-1: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-4: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-2: Wrote 1000 images to
/tmp/madlib_xlKP6JhnfV/cifar_10_test0007.tmp\n",
+ "PoolWorker-3: Wrote 1000 images to
/tmp/madlib_kfSWAjQUxH/cifar_10_test0007.tmp\n",
+ "PoolWorker-1: Wrote 1000 images to
/tmp/madlib_KdLWwZ322f/cifar_10_test0007.tmp\n",
+ "PoolWorker-5: Wrote 1000 images to
/tmp/madlib_4v2Q1jvkZs/cifar_10_test0007.tmp\n",
+ "PoolWorker-4: Wrote 1000 images to
/tmp/madlib_hmXBkZ2Rd5/cifar_10_test0007.tmp\n",
+ "PoolWorker-2: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-5: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-3: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-4: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-1: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-3: Wrote 1000 images to
/tmp/madlib_kfSWAjQUxH/cifar_10_test0008.tmp\n",
+ "PoolWorker-1: Wrote 1000 images to
/tmp/madlib_KdLWwZ322f/cifar_10_test0008.tmp\n",
+ "PoolWorker-2: Wrote 1000 images to
/tmp/madlib_xlKP6JhnfV/cifar_10_test0008.tmp\n",
+ "PoolWorker-4: Wrote 1000 images to
/tmp/madlib_hmXBkZ2Rd5/cifar_10_test0008.tmp\n",
+ "PoolWorker-5: Wrote 1000 images to
/tmp/madlib_4v2Q1jvkZs/cifar_10_test0008.tmp\n",
+ "PoolWorker-1: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-3: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-4: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-5: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-2: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-1: Wrote 1000 images to
/tmp/madlib_KdLWwZ322f/cifar_10_test0009.tmp\n",
+ "PoolWorker-3: Wrote 1000 images to
/tmp/madlib_kfSWAjQUxH/cifar_10_test0009.tmp\n",
+ "PoolWorker-1: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-3: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-1: Wrote 1000 images to
/tmp/madlib_KdLWwZ322f/cifar_10_test0010.tmp\n",
+ "PoolWorker-3: Wrote 1000 images to
/tmp/madlib_kfSWAjQUxH/cifar_10_test0010.tmp\n",
+ "PoolWorker-3: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-1: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-1: Wrote 1000 images to
/tmp/madlib_KdLWwZ322f/cifar_10_test0011.tmp\n",
+ "PoolWorker-1: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-2: Removed temporary directory PoolWorker-2\n",
+ "PoolWorker-4: Removed temporary directory PoolWorker-4\n",
+ "PoolWorker-5: Removed temporary directory PoolWorker-5\n",
+ "PoolWorker-3: Removed temporary directory PoolWorker-3\n",
+ "PoolWorker-1: Removed temporary directory PoolWorker-1\n",
+ "Done! Loaded 50000 images in 50.5676851273s\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Save images to temporary directories and load into database\n",
+ "iloader.load_np_array_to_table(data_x, data_y, 'cifar_10_test',
append=False, img_names=None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "SyntaxError",
+ "evalue": "invalid syntax (<ipython-input-2-c4a66f4c96da>, line 1)",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;36m File
\u001b[0;32m\"<ipython-input-2-c4a66f4c96da>\"\u001b[0;36m, line
\u001b[0;32m1\u001b[0m\n\u001b[0;31m TODO: no_temp_files option currently
has a bug--it looks like it succeeds, but table ens up being
empty.\u001b[0m\n\u001b[0m
^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid
syntax\n"
+ ]
+ }
+ ],
+ "source": [
+ "TODO: no_temp_files option currently has a bug--it looks like it
succeeds, but table ends up being empty."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Appending to table cifar_10_test in madlib db\n",
+ "Spawning 5 workers...\n",
+ "Initializing PoolWorker-6 [pid 279]\n",
+ "Initializing PoolWorker-7 [pid 280]\n",
+ "Initializing PoolWorker-8 [pid 281]\n",
+ "Initializing PoolWorker-9 [pid 284]\n",
+ "PoolWorker-6: Connected to madlib db.\n",
+ "Initializing PoolWorker-10 [pid 285]\n",
+ "PoolWorker-7: Connected to madlib db.\n",
+ "PoolWorker-8: Connected to madlib db.\n",
+ "PoolWorker-9: Connected to madlib db.\n",
+ "PoolWorker-10: Connected to madlib db.\n",
+ "PoolWorker-6: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-7: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-9: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-8: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-10: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-6: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-7: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-9: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-8: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-10: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-6: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-7: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-9: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-8: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-10: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-6: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-7: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-9: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-8: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-10: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-6: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-7: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-9: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-10: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-8: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-6: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-7: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-9: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-10: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-8: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-6: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-7: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-9: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-10: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-8: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-6: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-7: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-9: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-10: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-8: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-6: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-7: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-9: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-10: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-8: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-6: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-7: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-6: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-7: Loaded 1000 images into cifar_10_test\n",
+ "PoolWorker-6: Loaded 1000 images into cifar_10_test\n",
+ "Done! Loaded 50000 images in 18.1218080521s\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Same thing, but without writing out any temporary files; everything
handled in-memory.\n",
+ "# Should run about twice as fast.\n",
+ "\n",
+ "iloader.ROWS_PER_FILE = 1000 # Try adjusting this downward, if running
low on memory\n",
+ "iloader.load_np_array_to_table(data_x, data_y, 'cifar_10_test',
append=True, no_temp_files=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Try loading a different dataset\n",
+ "train_data, _ = fashion_mnist.load_data()\n",
+ "data_x, data_y = train_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Executing: CREATE TABLE fashion_mnist_test (id SERIAL, x REAL[], y
TEXT)\n",
+ "CREATE TABLE\n",
+ "Created table fashion_mnist_test in madlib db\n",
+ "Spawning 5 workers...\n",
+ "Initializing PoolWorker-11 [pid 317]\n",
+ "Initializing PoolWorker-12 [pid 318]\n",
+ "Initializing PoolWorker-13 [pid 319]\n",
+ "Initializing PoolWorker-14 [pid 320]\n",
+ "Initializing PoolWorker-15 [pid 321]\n",
+ "PoolWorker-11: Connected to madlib db.\n",
+ "PoolWorker-12: Connected to madlib db.\n",
+ "PoolWorker-13: Connected to madlib db.\n",
+ "PoolWorker-14: Connected to madlib db.\n",
+ "PoolWorker-15: Connected to madlib db.\n",
+ "PoolWorker-11: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-12: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-13: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-14: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-15: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-11: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-12: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-13: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-14: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-15: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-11: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-12: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-13: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-14: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-15: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-11: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-12: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-13: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-11: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-14: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-12: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-15: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-13: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-11: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-14: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-12: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-15: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-13: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-14: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-11: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-15: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-12: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-13: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-11: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-14: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-12: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-15: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-13: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-11: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-14: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-12: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-15: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-13: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-11: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-14: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-12: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-15: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-13: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-11: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-14: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-12: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-15: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-13: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-11: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-14: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-12: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-13: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-15: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-14: Loaded 1000 images into fashion_mnist_test\n",
+ "PoolWorker-15: Loaded 1000 images into fashion_mnist_test\n",
+ "Done! Loaded 60000 images in 11.783424139s\n"
+ ]
+ }
+ ],
+ "source": [
+ "iloader.load_np_array_to_table(data_x, data_y, 'fashion_mnist_test',
append=False, no_temp_files=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 2",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.10"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/community-artifacts/madlib_image_loader.py
b/community-artifacts/madlib_image_loader.py
new file mode 100644
index 0000000..b999036
--- /dev/null
+++ b/community-artifacts/madlib_image_loader.py
@@ -0,0 +1,418 @@
+#
+# Python module to load images into postgres or greenplum db, for
+# use with madlib deep_learning module.
+#
+# The format of the image tables created will have at least 3 columns:
+# (id SERIAL, x REAL[], y). Each row is 1 image,
+# with image data represented by x (a 3D array of type "real"), and
+# y (category) as text. id is just a unique identifier for each image,
+# so they don't get mixed up during prediction.
+#
+# ImageLoader.ROWS_PER_FILE = 1000 by default; this is the number of rows per
+# temporary file (or StringIO buffer) loaded at once.
+#
+
+# User API is through ImageLoader and DbCredentials class constructors,
+# and ImageLoader.load_np_array_to_table
+#
+# 1. Create objects:
+#
+# db_creds = DbCredentials(db_name='madlib', user=None, password='',
+# host='localhost', port=5432)
+#
+# iloader = ImageLoader(db_creds, num_workers, table_name=None)
+#
+# 2. Perform parallel image loading:
+#
+# iloader.load_np_array_to_table(data_x, data_y, table_name,
+# append=False, img_names=None,
+# no_temp_files=False)
+#
+# data_x contains image data in np.array format, and data_y is a 1D np.array
+# of the image categories (labels).
+#
+# Default database credentials are: localhost port 5432, madlib db, no
+# password. Calling the default constructor DbCredentials() will attempt
+# to connect using these credentials, but any of them can be overridden.
+#
+# append=False attempts to create a new table, while append=True appends
more
+# images to an existing table.
+#
+# If the user passes a table_name while creating ImageLoader object, it will
+# be used for all further calls to load_np_array_to_table. It can be
+# changed by passing it as a parameter during the actual call to
+# load_np_array_to_table, and if so future calls will load to that table
+# name instead. This avoids needing to pass the table_name again every
+# time, but also allows it to be changed at any time.
+#
+# EXPERIMENTAL: If no_temp_files=True, the operation will happen without
+# writing out the tables to temporary files before loading
them.
+# Instead, an in-memory filelike buffer (StringIO) will be
used
+# to build the tables before loading.
+#
+# img_names: this is currently unused, but we plan to use it when we add
+# support for loading images from disk.
+
+import numpy as np
+import sys
+import os
+import re
+import gc
+import random
+import string
+import psycopg2 as db
+from multiprocessing import Pool, current_process
+from shutil import rmtree
+import time
+import signal
+import traceback
+from cStringIO import StringIO
+
class SignalException(Exception):
    """Raised by the worker signal handler to abort image loading."""
    pass
+
def _worker_sig_handler(signum, frame):
    """Signal handler installed in each pool worker.

    Translates a received POSIX signal into a SignalException so the
    worker's normal exception path can report it and unwind.

    @signum signal number delivered to the process
    @frame  execution frame at delivery time (used only for the SIGSEGV trace)
    """
    if signum == signal.SIGINT:
        msg = "Received SIGINT in worker."
    elif signum == signal.SIGTERM:
        msg = "Received SIGTERM in worker."
        # BUGFIX: _worker_cleanup takes a (dummy) positional argument;
        # calling it with no args raised TypeError here and skipped the
        # temp-dir removal it was meant to perform.
        _worker_cleanup(None)
    elif signum == signal.SIGSEGV:
        msg = "Received SIGSEGV in worker."
        traceback.print_stack(frame)
    else:
        msg = "Received unknown signal in worker"

    raise SignalException(msg)
+
def _call_worker(data):
    """Entry point Pool.map invokes in each worker process.

    Dispatches one chunk of images to the per-process ImageLoader
    singleton (module-global `iloader`, set up in init_worker).

    @data list of (x, y) image rows for this worker to load
    """
    try:
        if iloader.no_temp_files:
            iloader._just_load(data)
        else:
            iloader._write_tmp_file_and_load(data)
    except Exception:
        if iloader.tmp_dir:
            iloader.rm_temp_dir()
        # When an exception is raised in a worker, multiprocessing does
        # not show the stack trace, so print it here.  (The exception
        # message itself will get printed by the mother process.)
        print("\nError in {0} while loading images".format(iloader.pr_name))
        print(traceback.format_exc())
        # BUGFIX: bare raise preserves the original traceback, which
        # `raise e` discards in Python 2.
        raise
+
def _worker_cleanup(dummy=None):
    """Remove the worker's temporary directory, if it created one.

    Called when a worker process is terminated.  The parameter is unused;
    it exists because Pool teardown call sites pass one argument.  It now
    defaults to None so argument-less calls (e.g. from the SIGTERM branch
    of _worker_sig_handler) no longer raise TypeError.
    """
    if iloader.tmp_dir:
        iloader.rm_temp_dir()
+
def init_worker(mother_pid, table_name, append, no_temp_files, db_creds):
    """Pool initializer: build this worker's ImageLoader singleton.

    Connects to the db, installs signal handlers, and (unless
    no_temp_files) creates a scratch directory for COPY files.

    @mother_pid    pid of the parent process that spawned the pool
    @table_name    destination table for this load
    @append        whether rows are appended to an existing table
    @no_temp_files if True, load via in-memory buffers instead of files
    @db_creds      DbCredentials used for the worker's own connection
    """
    pr = current_process()
    print("Initializing {0} [pid {1}]".format(pr.name, pr.pid))

    iloader = None  # BUGFIX: keep the name bound even if the ctor raises
    try:
        iloader = ImageLoader(db_creds=db_creds)
        iloader.mother_pid = mother_pid
        iloader.table_name = table_name
        iloader.no_temp_files = no_temp_files
        iloader.img_names = None
        signal.signal(signal.SIGINT, _worker_sig_handler)
        signal.signal(signal.SIGSEGV, _worker_sig_handler)
        # NOTE(review): _worker_sig_handler has a SIGTERM branch, but
        # SIGTERM is never registered here -- confirm whether that is
        # intentional before wiring it up.
        if not no_temp_files:
            iloader.mk_temp_dir()
        iloader.db_connect()
    except Exception:
        # Previously this dereferenced `iloader` unconditionally; if the
        # ImageLoader constructor itself raised, the resulting NameError
        # masked the real failure.
        if iloader is not None and iloader.tmp_dir:
            iloader.rm_temp_dir()
        print("\nException in {0} init_worker:".format(pr.name))
        print(traceback.format_exc())
        raise
+
class DbCredentials:
    """Connection parameters for a postgres or greenplum database.

    Defaults match a local madlib db: localhost:5432, user's login name,
    empty password.  Any field can be overridden via the constructor.
    """
    def __init__(self, db_name='madlib', user=None, password='',
                 host='localhost', port=5432):
        if user:
            self.user = user
        else:
            # BUGFIX: os.environ["USER"] raises KeyError when $USER is
            # unset (cron jobs, some containers).  getpass.getuser()
            # checks LOGNAME/USER/LNAME/USERNAME and finally the password
            # database, so it always yields a name.
            import getpass
            self.user = getpass.getuser()

        self.db_name = db_name
        self.password = password
        self.host = host
        self.port = port
+
+class ImageLoader:
+ def __init__(self, db_creds=None, num_workers=None, table_name=None):
+ self.num_workers = num_workers
+ self.append = False
+ self.img_num = 0
+ self.db_creds = db_creds
+ self.db_conn = None
+ self.db_cur = None
+ self.tmp_dir = None
+ self.mother = False
+ self.pr_name = current_process().name
+ self.table_name = table_name
+
+ global iloader # Singleton per process
+ iloader = self
+
+ def _random_string(self):
+ return ''.join([random.choice(string.ascii_letters + string.digits)\
+ for n in xrange(10)])
+
+ def mk_temp_dir(self):
+ self.tmp_dir = '/tmp/madlib_{0}'.format(self._random_string())
+ os.mkdir(self.tmp_dir)
+ print("{0}: Created temporary directory {0}"\
+ .format(self.pr_name, self.tmp_dir))
+
+ def rm_temp_dir(self):
+ rmtree(self.tmp_dir)
+ self.tmp_dir = None
+ print("{0}: Removed temporary directory {0}"\
+ .format(self.pr_name, self.tmp_dir))
+
+ def db_connect(self):
+ if self.db_cur:
+ return
+
+ db_name = self.db_creds.db_name
+ user = self.db_creds.user
+ host = self.db_creds.host
+ port = self.db_creds.port
+ password = self.db_creds.password
+ connection_string = "dbname={0} user={1} host={2} port={3}
password={4}"\
+ .format(db_name, user, host, port, password)
+
+ try:
+ self.db_conn = db.connect(connection_string)
+ self.db_cur = self.db_conn.cursor()
+ self.db_conn.autocommit = True
+
+ except db.DatabaseError as error:
+ self.db_close()
+ print(error)
+ raise error
+
+ print("{0}: Connected to {1} db.".
+ format(self.pr_name, self.db_creds.db_name))
+
+ def db_exec(self, query, args=None, echo=True):
+ if self.db_cur is not None:
+ if echo:
+ print "Executing: {0}".format(query)
+ self.db_cur.execute(query, args)
+ if echo:
+ print self.db_cur.statusmessage
+ else:
+ raise RuntimeError("{0}: db_cur is None in db_exec"\
+ .format(self.pr_name))
+
+ def db_close(self):
+ if self.db_cur is not None:
+ self.db_cur.close()
+ self.db_cur = None
+ if isinstance(self.db_conn, db.extensions.connection):
+ self.db_conn.close()
+ self.db_conn = None
+
+ def _gen_lines(self, data, img_names=None):
+ for i, row in enumerate(data):
+ x, y = row
+ line = str(x.tolist())
+ line = line.replace('[','{').replace(']','}')
+ if img_names:
+ line = '{0}|{1}|{2}\n'.format(line, y, img_names[i])
+ else:
+ line = '{0}|{1}\n'.format(line, y)
+ yield line
+
+ def _write_file(self, file_object, data, img_names=None):
+ lines = self._gen_lines(data, img_names)
+ file_object.writelines(lines)
+
+ ROWS_PER_FILE = 1000
+
+ # Copies from open file-like object f into database
+ def _copy_into_db(self, f, data):
+ table_name = self.table_name
+ img_names = self.img_names
+
+ if img_names:
+ self.db_cur.copy_from(f, table_name, sep='|', columns=['x','y',
+ 'img_name'])
+ else:
+ self.db_cur.copy_from(f, table_name, sep='|', columns=['x','y'])
+
+ print("{0}: Loaded {1} images into {2}".format(self.pr_name, len(data),
+ self.table_name))
+
+ # Use in-memory buffer as file-like object to load a block of data into db
+ # (no temp files written)
+ def _just_load(self, data):
+ f = StringIO()
+ self._write_file(f, data)
+ self._copy_into_db(f, data)
+ f.close()
+
+ # Write out a temporary file and then load it into db as a table
+ def _write_tmp_file_and_load(self, data):
+ table_name = self.table_name
+
+ if not self.tmp_dir:
+ print("{0}: Can't find temporary directory... exiting."\
+ .format(self.pr_name))
+ time.sleep(1) # allow some time for p.terminate() to be called
+ return
+
+ filename = os.path.join(self.tmp_dir, '{0}{1:04}.tmp'.format(
+ table_name, self.img_num))
+
+ self.img_num += 1
+ with file(filename, 'w') as f:
+ self._write_file(f, data)
+
+ print("{0}: Wrote {1} images to {2}".format(self.pr_name, len(data),
+ filename))
+
+ with file(filename, 'r') as f:
+ self._copy_into_db(f, data)
+
+ def load_np_array_to_table(self, data_x, data_y, table_name=None,
+ append=False, img_names=None,
+ no_temp_files=False):
+ """
+ Loads a numpy array into db. For append=False, creates a new table and
+ loads the data. For append=True, appends data to existing table.
+ Throws an exception if append=False and table_name already exists,
+ or if append=True and table_name does not exist. Makes use of
+ worker processes initialized during ImageLoader object creation to
+ load in parallel.
+ @data_x independent variable data, a numpy array of images. Size of
+ first dimension is number of images. Rest of dimensions determined
+ by image resolution and number of channels.
+ @data_y dependent variable data (image classes), as an numpy array
+ @table_name Name of table in db to load data into
+ @append Whether to create a new table (False) or append to an existing
+ one (True). If unspecified, default is False @img_names If not
None,
+ a list of the image names corresponding to elements of the data_x
+ numpy array. If present, this is included as a column in the
table.
+ @no_temp_files If specified, no temporary files are written--all
+ operations are performed in-memory.
+
+ """
+ start_time = time.time()
+ self.mother = True
+ self.append = append
+ if table_name:
+ self.table_name = table_name
+
+ if not self.table_name:
+ raise ValueError("Must specify table_name either in ImageLoader"
+ " constructor or in load_np_array_to_table params!")
+
+ if len(data_x) != len(data_y):
+ raise ValueError("Invalid dataset passed, number of labels in "
+ "data_y ({0}) does not match number of images "
+ "in data_x ({1})"\
+ .format(len(data_y), len(data_x)))
+
+ self.db_connect()
+
+ if self.append:
+ # Validate that table already exists
+ try:
+ self.db_exec("SELECT count(*) FROM
{0}".format(self.table_name),
+ echo=False)
+ except db.DatabaseError:
+ raise RuntimeError("append=True passed, but cannot append to "
+ "table {0} in db {1}. Either make sure the
"
+ "table exists and you have access to it, or
"
+ "use append=False (default) to auto-create
it"
+ "during loading."
+ .format(self.table_name, self.db_creds.db_name))
+
+ print "Appending to table {0} in {1} db".format(self.table_name,
+
self.db_creds.db_name)
+ else:
+ # Create new table
+ try:
+ if img_names:
+ sql = "CREATE TABLE {0} (id SERIAL, x REAL[], y TEXT,\
+ img_name TEXT)".format(self.table_name)
+ else:
+ sql = "CREATE TABLE {0} (id SERIAL, x REAL[], y TEXT)"\
+ .format( self.table_name)
+ self.db_exec(sql)
+ except(Exception, db.DatabaseError):
+ raise RuntimeError("Table {0} already exists in {1} db. Use "
+ "append=True to append more images to it."
+ .format(self.table_name, self.db_creds.db_name))
+
+ print "Created table {0} in {1} db".format(self.table_name,
+ self.db_creds.db_name)
+
+ self.db_close()
+
+ data_y = data_y.flatten()
+ data = zip(data_x, data_y)
+
+ print("Spawning {0} workers...".format(self.num_workers))
+
+ p = Pool(processes=self.num_workers,
+ initializer=init_worker,
+ initargs=(current_process().pid,
+ self.table_name,
+ self.append,
+ no_temp_files,
+ self.db_creds))
+
+
+ datas = []
+
+ for n in range(0, len(data), self.ROWS_PER_FILE):
+ datas.append(data[n:n+self.ROWS_PER_FILE])
+
+ #
+ # Each element in datas is a list of self.ROWS_PER_FILE rows
+ #
+ # Shape of datas: ( number of files, rows per file, ( x-dim, y-dim )
)
+ #
+ # ( inside x can also be a numpy tensor with several dimensions, but y
+ # should just be a single scalar )
+ #
+ # multiprocessing library will call _call_worker() in some worker for
+ # each file, splitting the list of files up into roughly equal chunks
+ # for each worker to handle. For example, if there are 500 files and
+ # 5 workers, each will handle about 100 files, and _call_worker()
will
+ # be called 100 times, each time with a different file full of
images.
+ #
+
+ try:
+ p.map(_call_worker, datas)
+ except(Exception) as e:
+ p.map(_worker_cleanup, [0] * self.num_workers)
+ p.terminate()
+ raise e
+
+ p.map(_worker_cleanup, [0] * self.num_workers)
+ end_time = time.time()
+ print("Done! Loaded {0} images in {1}s"\
+ .format(len(data), end_time - start_time))
+ p.terminate()
+
+# Uncommenting the code below can be useful for testing, but will be removed
+# once we add a main() function intended to be called by a user who wants to
+# load images from disk.
+#
+# def test_loading_nparray(data_x, data_y):
+# db_creds = DbCredentials(port=5094)
+# iloader = ImageLoader(num_workers=5, table_name='cifar_10_test',
+# db_creds=db_creds)
+# iloader.load_np_array_to_table(data_x, data_y, append=True)
+#
+# if __name__ == '__main__':
+# train_data, _ = cifar10.load_data()
+# test_loading_nparray(*train_data)