Hi,
Here is my first report. You can also find it on my Gitlab [0].
Week 1 - 2014/05/25
For this first week, I have written a test script that generates some
simple datasets, and produces an image containing the output of the MADlib
clustering algorithms.
This script can be called like this:
./clustering_test.py new ds0 -n 8  # generates a dataset called "ds0" with 8 clusters
./clustering_test.py query ds0 -o output.png  # outputs the result of the clustering algorithms applied to ds0 in output.png
See ./clustering_test.py -h for all the available options.
An example of output can be found here [1].
Of course, I will keep improving this test script, as it is still far from
perfect; but for now, it does approximately what I want.
For next week, I'll start working on the implementation of k-medoids in
MADlib. As a reminder, according to the timeline I suggested for the
project, this step must be done on May 30. Depending on the problems I will
face (mostly lack of knowledge of the codebase, I guess), this might not be
finished on time, but it should be done a few days later (by the end of
next week, hopefully).
Attached is the patch containing everything I have done this week, though
the git log might be more convenient to read.
Regards,
Maxence A.
[0] http://git.viod.eu/viod/gsoc_2014/blob/master/reports.rst
[1]
http://git.viod.eu/viod/gsoc_2014/blob/master/clustering_test/example_dataset.png
--
Maxence Ahlouche
06 06 66 97 00
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..97de20e
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+***/__pycache__/
+**.png
\ No newline at end of file
diff --git a/autogen_results.py b/autogen_results.py
deleted file mode 100755
index 033c309..0000000
--- a/autogen_results.py
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/python
-
-import os
-
-while(True):
- os.system("./k-means_test.py --regen -o results/$(date | md5sum | cut -d ' ' -f 1).png")
diff --git a/clustering_test.py b/clustering_test.py
deleted file mode 100755
index 2afc0d1..0000000
--- a/clustering_test.py
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import psycopg2 as pg
-import sys
-
-
-class DatabaseConnection():
- db_name = 'madlib'
- user = 'madlib'
- host = 'localhost'
- port = 5432
- table_name = 'tmp_points'
- field_name = 'coords'
-
- def __init__(self):
- self.conn = pg.connect(database=self.db_name, user=self.user, host=self.host, port=5432)
- self.cur = self.conn.cursor()
- self.cur.execute('DROP TABLE IF EXISTS %s CASCADE;' % self.table_name)
- self.cur.execute('CREATE TABLE %s (id SERIAL PRIMARY KEY, coords INT[]);' % self.table_name)
- self.conn.commit()
-
- def __del__(self):
- self.cur.close()
- self.conn.close()
-
-
-def main(args):
- parser = argparse.ArgumentParser(description='Visualize output of the clustering algorithms provided by MADlib, in PostgreSQL.')
- parser.add_argument('-n', metavar='number of clusters', type=int)
-
- dc = DatabaseConnection()
-
-if __name__ == '__main__':
- main(sys.argv[1:])
diff --git a/clustering_test/autogen_results.py b/clustering_test/autogen_results.py
new file mode 100755
index 0000000..033c309
--- /dev/null
+++ b/clustering_test/autogen_results.py
@@ -0,0 +1,6 @@
+#!/usr/bin/python
+
+import os
+
+while(True):
+ os.system("./k-means_test.py --regen -o results/$(date | md5sum | cut -d ' ' -f 1).png")
diff --git a/clustering_test/clustering_test.py b/clustering_test/clustering_test.py
new file mode 100755
index 0000000..248b5cf
--- /dev/null
+++ b/clustering_test/clustering_test.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+
+import argparse
+
+import database as db
+import dataset_generator as ds
+import visualiser as vs
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser(
+ description='Visualize output of the clustering algorithms provided by '
+ 'MADlib, in PostgreSQL. You should start by adding a dataset. You need '
+ 'a PostgreSQL running.')
+ subparsers = parser.add_subparsers(help='subparsers help', dest='action')
+
+ parser_dataset = subparsers.add_parser('new', help='generate a dataset')
+ parser_dataset.add_argument(
+ 'dataset_name',
+ help='the name of the dataset to create',
+ )
+ parser_dataset.add_argument(
+ '-n',
+ '--nb_clusters',
+ type=int,
+ help='the number of clusters composing the new dataset. Defaults to a '
+ 'random value between 2 and 10.',
+ )
+ parser_dataset.add_argument(
+ '-d',
+ '--distribution',
+ choices = ds.gen_cluster.keys(),
+ help='the distribution of the points in the clusters. Defaults to '
+ 'uniform.',
+ )
+
+ parser_query = subparsers.add_parser('query', help='apply clustering algorithms on a dataset')
+ parser_query.add_argument(
+ 'dataset_name',
+ help='the name of the dataset to query',
+ )
+ parser_query.add_argument(
+ '-n',
+ '--nb_clusters',
+ type=int,
+ help='the number of clusters in the dataset. Default to the actual number of '
+ 'clusters of the dataset.',
+ )
+ parser_query.add_argument(
+ '-o',
+ '--output_file',
+ help='the file in which the output will be saved.',
+ )
+
+ args = parser.parse_args()
+
+ if args.action == 'new':
+ if args.nb_clusters is None:
+ ds.gen_dataset(args.dataset_name)
+ else:
+ ds.gen_dataset(args.dataset_name, args.nb_clusters)
+ elif args.action == 'query':
+ vs.gen_image(args.dataset_name, args.output_file)
diff --git a/clustering_test/database.py b/clustering_test/database.py
new file mode 100644
index 0000000..c47256d
--- /dev/null
+++ b/clustering_test/database.py
@@ -0,0 +1,163 @@
+'''
+Create the tables in the database and query them.
+'''
+
+import psycopg2 as pg
+
+
+# database parameters
+db_name = 'madlib'
+user = 'madlib'
+host = 'localhost'
+port = 5432
+table_name = 'tmp_point'
+field_name = 'coords'
+
+# connection and cursor objects
+conn = None
+cur = None
+
+# available clustering algorithms
+clustering_algorithms = [
+ 'kmeans_random',
+ 'kmeanspp',
+]
+
+
+class DatabaseConnection():
+ def __init__(self, f):
+ self.f = f
+
+ def __call__(self, *k, **kw):
+ if conn is None:
+ self.connect()
+ result = self.f(*k, **kw)
+ conn.commit()
+ return result
+
+ def connect(self):
+ global conn, cur
+ conn = pg.connect(database=db_name, user=user, host=host, port=5432)
+ cur = conn.cursor()
+
+ # create the model, if it doesn't exist (should only be executed once)
+ cur.execute('CREATE TABLE IF NOT EXISTS dataset ('
+ 'id serial PRIMARY KEY, '
+ 'name varchar(80) UNIQUE, '
+ 'nb_clusters int);')
+ cur.execute('CREATE TABLE IF NOT EXISTS point ('
+ 'id serial PRIMARY KEY,'
+ 'coords int[],'
+ 'dataset_id int REFERENCES dataset(id) ON DELETE CASCADE,'
+ 'cluster_id int);')
+
+ # create the temporary points table
+ cur.execute('CREATE TABLE IF NOT EXISTS %s ('
+ 'id int REFERENCES point(id) ON DELETE CASCADE, '
+ '%s int[]);'
+ % (table_name, field_name))
+
+
+@DatabaseConnection
+def insert_dataset(dataset_name, clusters):
+ # insert the dataset
+ cur.execute('INSERT INTO dataset (name, nb_clusters) '
+ 'VALUES (%s, %s) '
+ 'RETURNING id;',
+ [dataset_name, len(clusters)])
+
+ # get the dataset's id
+ dataset_id = cur.fetchone()[0]
+
+ # insert the dataset's points
+ for i, cluster in enumerate(clusters):
+ query = 'INSERT INTO point (coords, dataset_id, cluster_id) VALUES\n'
+ for j, point in enumerate(cluster):
+ query += "('{" + str(point[0]) + ', ' + str(point[1]) + "}', " + \
+ str(dataset_id) + ', ' + str(i) + ')'
+ query = query + ',\n' if j < len(cluster) - 1 else query + '\n'
+ query += ';'
+
+ cur.execute(query)
+
+
+@DatabaseConnection
+def load_dataset(dataset_name):
+ '''
+ Load the chosen dataset in the table_name table.
+ '''
+
+ cur.execute('DELETE FROM tmp_point;')
+ cur.execute("INSERT INTO %s (id, coords) "
+ "SELECT point.id, point.coords "
+ "FROM point "
+ "JOIN dataset ON point.dataset_id = dataset.id "
+ "WHERE dataset.name = '%s';"
+ % (table_name, dataset_name))
+
+
+@DatabaseConnection
+def get_nb_clusters(dataset):
+ cur.execute(
+ 'SELECT nb_clusters '
+ 'FROM dataset '
+ 'WHERE name = %s;',
+ [dataset]
+ )
+
+ return cur.fetchone()[0]
+
+
+@DatabaseConnection
+def get_centroids(clustering_alg, nb_clusters):
+ '''
+ Apply the clustering_algorithms to the current dataset loaded in the temp
+ table.
+ '''
+ cur.execute(
+ "SELECT * "
+ "FROM madlib.%s('%s', '%s', %s);"
+ % (clustering_alg, table_name, field_name, nb_clusters)
+ )
+ return cur.fetchone()
+
+
+@DatabaseConnection
+def get_points(dataset=None):
+ '''
+ Get all the points of a specific dataset. If the dataset is not
+ provided, return the points in the temp table instead.
+ '''
+
+ if dataset is None:
+ cur.execute(
+ 'SELECT id, %s, cluster_id '
+ 'FROM %s;'
+ % (field_name, table_name)
+ )
+
+ return cur.fetchall()
+
+ else:
+ cur.execute(
+ 'SELECT id, coords, cluster_id '
+ 'FROM point '
+ 'WHERE dataset_id IN (SELECT id FROM dataset WHERE name = %s LIMIT 1);',
+ [dataset]
+ )
+
+ return cur.fetchall()
+
+
+@DatabaseConnection
+def get_current_dataset():
+ cur.execute(
+ 'SELECT ds.id, ds.name '
+ 'FROM DATASET ds '
+ 'JOIN point p ON ds.id = p.dataset_id '
+ 'JOIN %s tp on p.id = tp.id '
+ 'LIMIT 1;'
+ % table_name
+ )
+
+ return cur.fetchone()
diff --git a/clustering_test/dataset_generator.py b/clustering_test/dataset_generator.py
new file mode 100644
index 0000000..0248f4a
--- /dev/null
+++ b/clustering_test/dataset_generator.py
@@ -0,0 +1,51 @@
+import database as db
+import random
+
+'''
+Generate small toy datasets to test clustering algorithms.
+'''
+
+max_x = 300
+max_y = 300
+
+
+def gen_uniform_cluster(
+ nb_points=None,
+ lower_x=None,
+ upper_x=None,
+ lower_y=None,
+ upper_y=None,
+):
+ if lower_x is None:
+ lower_x = random.randint(0, max_x-1)
+ if upper_x is None:
+ upper_x = random.randint(lower_x, max_x-1)
+ if lower_y is None:
+ lower_y = random.randint(0, max_y-1)
+ if upper_y is None:
+ upper_y = random.randint(lower_y, max_y-1)
+ if nb_points is None:
+ nb_points = random.randint(100, 1000)
+
+ cluster = []
+ for i in range(nb_points):
+ cluster.append((random.randint(lower_x, upper_x), random.randint(lower_y, upper_y)))
+
+ return cluster
+
+
+def gen_dataset(
+ dataset_name,
+ nb_clusters=random.randint(0, 10),
+ distribution='uniform',
+ ):
+ clusters = []
+ for i in range(nb_clusters):
+ clusters.append(gen_cluster[distribution]())
+
+ db.insert_dataset(dataset_name, clusters)
+
+
+gen_cluster = {
+ 'uniform': gen_uniform_cluster,
+}
diff --git a/clustering_test/requirements.txt b/clustering_test/requirements.txt
new file mode 100644
index 0000000..615e61e
--- /dev/null
+++ b/clustering_test/requirements.txt
@@ -0,0 +1,2 @@
+Pillow==2.4.0
+psycopg2==2.5.3
diff --git a/clustering_test/visualiser.py b/clustering_test/visualiser.py
new file mode 100644
index 0000000..fc69a65
--- /dev/null
+++ b/clustering_test/visualiser.py
@@ -0,0 +1,139 @@
+import math
+from PIL import Image, ImageDraw
+
+import database as db
+import dataset_generator as dsg
+
+
+colors = [
+ (255, 0, 0), # red
+ (0, 255, 0), # green
+ (0, 0, 255), # blue
+ (255, 255, 0), # yellow
+ (0, 255, 255), # cyan
+ (255, 0, 255), # pink
+ (96, 0, 0), # dark_red
+ (0, 96, 0), # dark_green
+ (0, 0, 96), # dark_blue
+ (96, 96, 96), # grey
+ (0, 0, 0), # black
+]
+
+
+def distance(p1, p2):
+ '''
+ Compute the distance between p1 and p2.
+ '''
+
+ return math.sqrt(math.pow(p2[0] - p1[0], 2) + math.pow(p2[1] - p1[1], 2))
+
+
+def nearest_centroid(point, centroids):
+ '''
+ Assign a point to its nearest centroid.
+    Returns the index of the nearest centroid.
+ '''
+
+ nearest_centroid = 0
+ min_dist = distance(point[1], centroids[nearest_centroid])
+
+ for i, centroid in enumerate(centroids):
+ dist = distance(point[1], centroid)
+ if dist < min_dist:
+ min_dist = dist
+ nearest_centroid = i
+
+ return nearest_centroid
+
+
+def cluster(clustering_alg, dataset=None):
+ '''
+ Return the result of the clustering algorithms applied to dataset.
+ Returns the list of centroids, and a list of ((x, y), cluster_id).
+
+ dataset defaults to the last one used.
+ '''
+
+ # if no dataset specified, keep the current one, else update the temp table
+ if dataset is not None:
+ db.load_dataset(dataset)
+ else:
+ dataset = db.get_current_dataset()[1]
+
+ nb_clusters = db.get_nb_clusters(dataset)
+
+ # get the centroids and the points
+ centroids = db.get_centroids(clustering_alg, nb_clusters)[0]
+ points = db.get_points(dataset)
+
+ assigned_points = []
+ for point in points:
+ assigned_points.append((point, nearest_centroid(point, centroids)))
+
+ return centroids, assigned_points
+
+
+def gen_image(dataset=None, output_file=None):
+ '''
+ Write the output of the clustering algorithms in output_file.
+ If output_file is not provided, defaults to <dataset>.png
+ '''
+
+ def draw_centroid(bitmap, centroid, color):
+ # draw a black square
+ for i in range(max(int(centroid[0]) - 3, 0.), min(int(centroid[0]) + 3, dsg.max_x)):
+ for j in range(max(int(centroid[1]) - 3, 0), min(int(centroid[1]) + 3, dsg.max_x)):
+ bitmap[i * dsg.max_x + j] = colors[10]
+
+ # fill it with the correct color
+ for i in range(max(int(centroid[0]) - 2, 0.), min(int(centroid[0]) + 2, dsg.max_x)):
+ for j in range(max(int(centroid[1]) - 2, 0), min(int(centroid[1]) + 2, dsg.max_x)):
+ bitmap[i * dsg.max_x + j] = color
+
+ def draw_point(bitmap, point, color):
+ bitmap[point[0] * dsg.max_x + point[1]] = color
+
+ def draw_text(img, text):
+ draw = ImageDraw.Draw(img)
+ draw.text((10, 10), text, fill=colors[10]) # black
+
+ # if no dataset specified, get the current dataset
+ if dataset is None:
+ dataset = db.get_current_dataset()[1]
+
+ # if no output_file specified, name it after the dataset
+ if output_file is None:
+ output_file = dataset + '.png'
+
+ # draw the original clustering
+ img = Image.new("RGB", (dsg.max_x, dsg.max_y))
+ bitmap = [(255, 255, 255)] * dsg.max_x * dsg.max_y
+
+ points = db.get_points(dataset)
+ for point in points:
+ draw_point(bitmap, point[1], colors[point[2]])
+
+ img.putdata(bitmap)
+ draw_text(img, 'Original clustering')
+
+ result_img = Image.new('RGB', (dsg.max_x, dsg.max_y * (len(db.clustering_algorithms)+1)))
+ result_img.paste(img, (0, 0))
+
+    # draw the output of the clustering algorithms
+ for i, clustering_alg in enumerate(db.clustering_algorithms):
+ centroids, points = cluster(db.clustering_algorithms[0], dataset)
+ bitmap = [(255, 255, 255)] * dsg.max_x * dsg.max_y
+
+ for point in points:
+ draw_point(bitmap, point[0][1], colors[point[1]])
+
+ for j, centroid in enumerate(centroids):
+ draw_centroid(bitmap, centroid, colors[j])
+
+ img = Image.new("RGB", (dsg.max_x, dsg.max_y))
+ img.putdata(bitmap)
+ draw_text(img, clustering_alg)
+
+ result_img.paste(img, (0, (i+1) * (dsg.max_x+1)))
+
+ result_img.save(output_file)
diff --git a/k-means_test.py b/k-means_test.py
deleted file mode 100755
index 7d46de7..0000000
--- a/k-means_test.py
+++ /dev/null
@@ -1,350 +0,0 @@
-#!/usr/bin/python
-
-import postgresql
-import random
-import sys
-import getopt
-import math
-import pickle
-import time
-from PIL import Image, ImageDraw
-
-# db informations
-db_name = "madlib"
-db_user = "viod"
-db_server = "localhost"
-db_port = 5432
-db_table_name = "k_means_test"
-db_field_name = "coord"
-db = postgresql.open("pq://" + db_user + "@" + db_server + ":" + str(db_port) + "/" + db_name)
-
-# dataset informations
-ds_max_groups = 10
-ds_max_x = 300
-ds_max_y = 300
-group_max_elts = 1000
-group_max_width = 100
-group_max_height = 100
-
-default_output_file = "clustered_data.png"
-data_file = "clusters.dat"
-
-colors = [
- (255, 0, 0), # red
- (0, 255, 0), # green
- (0, 0, 255), # blue
- (255, 255, 0), # yellow
- (0, 255, 255), # cyan
- (255, 0, 255), # pink
- (96, 0, 0), # dark_red
- (0, 96, 0), # dark_green
- (0, 0, 96), # dark_blue
- (96, 96, 96), # grey
- (0, 0, 0) # black
- ]
-
-def create_test_table():
- """
- Create or replace the data table
- """
- try:
- db.execute("DROP TABLE IF EXISTS " + db_table_name + " CASCADE;")
- except UndefinedTableError:
- pass
- db.execute("CREATE TABLE " + db_table_name + " (" +
- "id SERIAL PRIMARY KEY, " +
- db_field_name + " int[]" +
- ");")
-
-def gaussian_random(lower_bound, upper_bound):
- """
- Generate a random number between lower_bound and upper_bound, assuming a gaussian repartition
- """
- mean = (upper_bound + lower_bound) / 2
- variance = (upper_bound - lower_bound) / 4
- x = random.gauss(mean, variance)
- while(x < lower_bound or x > upper_bound):
- x = random.gauss(mean, variance)
- return int(x)
-
-def insert_random_data(nb_groups):
- """
- Populate the table with groups of points chosen randomly
- """
- clusters = []
-
- # for each group
- for i in range(nb_groups):
- width = random.randint(1, group_max_width)
- height = random.randint(1, group_max_height)
- nb_elts = random.randint(1, group_max_elts)
- min_x = random.randint(1, ds_max_x - width)
- min_y = random.randint(1, ds_max_y - height)
- clusters.append( ((min_x + width/2, min_y + height/2), []) )
-
- # points generation
- for j in range(nb_elts):
- x = gaussian_random(min_x, min_x + width)
- y = gaussian_random(min_y, min_y + height)
- clusters[i][1].append((x,y))
- db.execute("INSERT INTO " + db_table_name + " (" + db_field_name + ") VALUES (" +
- "'{" + str(x) + "," + str(y) + "}');")
-
- # save clusters informations in a file
- data_dump = open(data_file, "wb")
- pickle.dump(nb_groups, data_dump)
- pickle.dump(clusters, data_dump)
- data_dump.close()
- return clusters
-
-def get_points():
- """
- Get back the points previously generated
- """
- c = db.prepare("SELECT " + db_field_name + " FROM " + db_table_name + ";").declare()
- points = []
- for p in c:
- points.append(list(p[0]))
- return points
-
-def apply_clustering_kmeans(nb_groups):
- """
- Call to MADlib's k-means clustering function
- """
- c = db.prepare("SELECT * FROM madlib.kmeans_random('" + db_table_name + "', '" +
- db_field_name + "', " + str(nb_groups) + ");").declare()
- result = c.read()[0]
- centroids = result[0]
- #objective_fn = result[1]
- #frac_reassigned = result[2]
- #num_iterations = result[3]
-
- # init clusters
- clusters = []
- for c in centroids:
- clusters.append((c, []))
-
- # assign each point to its cluster
- points = get_points()
- for p in points:
- # compute distances
- distances = []
- for c in centroids:
- distances.append(math.pow(c[0] - p[0], 2) + math.pow(c[1] - p[1], 2))
- # get the indice of the nearest centroid
- nearest = 0
- for i in range(1, len(distances)):
- if(distances[i] < distances[nearest]):
- nearest = i
- clusters[nearest][1].append(p)
-
- return clusters
-
-def apply_clustering_kmeanspp(nb_groups):
- """
- Call to MADlib's k-means clustering function
- """
- c = db.prepare("SELECT * FROM madlib.kmeanspp('" + db_table_name + "', '" +
- db_field_name + "', " + str(nb_groups) + ");").declare()
- result = c.read()[0]
- centroids = result[0]
- #objective_fn = result[1]
- #frac_reassigned = result[2]
- #num_iterations = result[3]
-
- # init clusters
- clusters = []
- for c in centroids:
- clusters.append((c, []))
-
- # assign each point to its cluster
- points = get_points()
- for p in points:
- # compute distances
- distances = []
- for c in centroids:
- distances.append(math.pow(c[0] - p[0], 2) + math.pow(c[1] - p[1], 2))
- # get the indice of the nearest centroid
- nearest = 0
- for i in range(1, len(distances)):
- if(distances[i] < distances[nearest]):
- nearest = i
- clusters[nearest][1].append(p)
-
- return clusters
-
-def export_to_png(clusters):
- """
- Visualize the result in a PNG file
- """
- def display_centroid(bitmap, x, y, color):
- """
- Display a big colored square to represent a centroid
- """
- # Draw a black square
-
- # vertical lines
- for i in max(0, int(x)-3), min(ds_max_x, int(x)+3):
- for j in range(max(0,int(y)-3),min(ds_max_y,int(y)+4)):
- bitmap[j * ds_max_x + i] = colors[10] # black
- # horizontal lines
- for i in range(max(0,int(x)-3), min(ds_max_x,int(x)+4)):
- for j in max(0,int(y)-3), min(ds_max_y, int(y)+3):
- bitmap[j * ds_max_x + i] = colors[10] # black
-
- # Fill this square with the color
- for i in range(max(0, int(y)-2), min(ds_max_y, int(y)+3)):
- for j in range(max(0, int(x)-2), min(ds_max_x, int(x)+3)):
- bitmap[i * ds_max_x + j] = color
-
- bitmap = [(255,255,255)] * ds_max_x * ds_max_y
-
- i = 0
- for centroid, points in clusters:
- # display points
- for p in points:
- bitmap[p[1] * ds_max_x + p[0]] = colors[i]
- # display centroid
- display_centroid(bitmap, centroid[0], centroid[1], colors[i])
- i += 1
-
- img = Image.new("RGB", (ds_max_x, ds_max_y))
- img.putdata(bitmap)
- return img
-
-def parse_args(argv):
- """
- Interpret the command line
- """
- try:
- opts, args = getopt.getopt(argv, "ho:rn:",
- ["regen", "help", "output-file=", "nb-groups="])
- except getopt.GetOptError:
- usage()
- sys.exit(2)
-
- regen = False
- nb_groups = 0
- output_file = default_output_file
- for opt, arg in opts:
- if opt in ("-h", "--help"):
- usage()
- sys.exit(0)
- elif opt in ("-o", "--output-file"):
- output_file = arg
- elif opt in ("-r", "--regen"):
- regen = True
- elif opt in ("-n", "--nb-groups"):
- nb_groups = arg
-
- return regen, nb_groups, output_file
-
-def generate_output(output_file, clusters_set):
- """
- Display all the clustering results on a single image
- """
- def add_title(img, title):
- draw = ImageDraw.Draw(img)
- draw.text((10, 10), description, fill=colors[10]) # black
-
- result_img = Image.new("RGB", (ds_max_x * len(clusters_set), ds_max_y))
-
- i = 0
- for clusters, description in clusters_set:
- tmp_img = export_to_png(clusters)
- add_title(tmp_img, description)
- result_img.paste(tmp_img, (i * (ds_max_x + 1), 0))
- i += 1
- result_img.save(output_file)
-
-def print_line(line):
- """
- Same as print, but allows to rewrite at the end of the line
- """
- print(line, end = "")
- sys.stdout.flush()
-
-def count_points(clusters):
- """
- Counts the points in a cluster set
- """
- nb_points = 0
- for c in clusters:
- nb_points += len(c[1])
- return nb_points
-
-def usage():
- print("""
-Usage:
- ./k-means_test.py -o output_file.png -n 4 -r
-
-Options:
- -o, --output-file output_file.png:
- The resulting PNG image.
- -r, --regen:
- Generate new points. You should use it at your first run.
- -n, --nb-groups n:
- Generate n groups of points. If not generating points, classify in n
- clusters.
- -h, --help:
- Display this help message.
-""")
-
-def main(args):
- regen, nb_groups, output_file = parse_args(args)
-
- if(regen):
- nb_groups = random.randint(2, ds_max_groups)
- print("Creating test table...")
- create_test_table()
- print_line("Generating random data... ")
- start = time.time()
- original_clusters = (insert_random_data(nb_groups), "Original clustering")
- finish = time.time()
-
- # nb_points = 0
- # for cluster in original_clusters[0]:
- # nb_points += len(cluster[1])
- print("Generated " + str(count_points(original_clusters[0])) + " points partitioned into " +
- str(len(original_clusters[0])) + " clusters in " +
- str(finish - start)[:6] + " seconds.")
- else:
- try:
- print_line("Loading data from " + data_file + "... ")
- start = time.time()
- data_dump = open(data_file, "rb")
- nb_groups = pickle.load(data_dump)
- original_clusters = (pickle.load(data_dump), "Original clustering")
- data_dump.close
- finish = time.time()
-
- print("Data loaded in " + str(finish - start)[:5] + " seconds.")
- except FileNotFoundError:
- print("Cannot load data, you need to generate some data first. Use --regen argument.")
- exit(3)
-
- # k-means clustering
- print_line("Clustering data using k-means algorithm... ")
- start = time.time()
- kmeans_clusters = (apply_clustering_kmeans(nb_groups), "K-means clustering")
- finish = time.time()
- print("Data clustered in " + str(finish - start)[:5] + " seconds.")
-
- # k-means++ clustering
- print_line("Clustering data using k-means++ algorithm... ")
- start = time.time()
- kmeanspp_clusters = (apply_clustering_kmeanspp(nb_groups), "K-means++ clustering")
- finish = time.time()
- print("Data clustered in " + str(finish - start)[:5] + " seconds.")
-
- # output generation
- print_line("Exporting to " + output_file + "...")
- start = time.time()
- generate_output(output_file, [ original_clusters, kmeans_clusters, kmeanspp_clusters])
- finish = time.time()
- print("File generated in " + str(finish - start)[:5] + " seconds.")
-
- print("Done.")
-
-if(__name__ == "__main__"):
- main(sys.argv[1:])
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers