This is an automated email from the ASF dual-hosted git repository. charlie pushed a commit to branch lib-comparison in repository https://gitbox.apache.org/repos/asf/datasketches-python.git
commit 52d43105be93b55ffb6052dbbb6b53b3cde5adcf Author: Charlie Dickens <[email protected]> AuthorDate: Wed Nov 22 12:23:49 2023 +0000 Added update time comparison --- .../update-time-comparison.ipynb | 692 +++++++++++++++++++++ 1 file changed, 692 insertions(+) diff --git a/jupyter/comparison-to-datasketch/update-time-comparison.ipynb b/jupyter/comparison-to-datasketch/update-time-comparison.ipynb new file mode 100644 index 0000000..d66a6e8 --- /dev/null +++ b/jupyter/comparison-to-datasketch/update-time-comparison.ipynb @@ -0,0 +1,692 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0bea40e1-297c-4ccd-a24d-4d2ac631683c", + "metadata": {}, + "source": [ + "# Update Time Comparison\n", + "\n", + "We compare the update time between the Apache DataSketches (ASF) HyperLogLog implementations alongside the `datasketch` HyperLogLogPlusPlus implementation. The same hash function is used for both libraries. We compare $4, 6, 8$ bit versions of the ASF implementations, showing that they have roughly equivalent update times which are all faster than the `datasketch` algorithm (which only provides support for $8$ bit HyperLogLog). We compare the update time when the input is the sam [...] + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e5091679-86a1-48ae-a5d5-3a0be9dc2d84", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import argparse\n", + "from datetime import datetime\n", + "import pandas as pd\n", + "import numpy as np\n", + "from utils import distinct_number_sequence\n", + "import datasketches as ds\n", + "import datasketch as d\n", + "import mmh3\n", + "import os\n", + "import matplotlib.pyplot as plt\n", + "from timeit import default_timer\n", + "import warnings\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dd7a7982-dc0b-4275-ba10-bbe02d214ea3", + "metadata": {}, + "outputs": [], + "source": [ + "class UpdateTimeProfile:\n", + " \"\"\"Generates an experiment evaluating the update time for different cardinality inputs\"\"\"\n", + " def __init__(self, sketch_lgk:int, lg_trials:int, max_lgN:int):\n", + " self.sketch_lgk = sketch_lgk\n", + " self.num_trials = 2**lg_trials\n", + " self.max_lgN = max_lgN\n", + " self.max_num_distincts = np.uint64(2 ** self.max_lgN)\n", + " self.directory_name = \"hll_update_time_profile_\"\n", + " if not os.path.exists(self.directory_name):\n", + " os.mkdir(self.directory_name)\n", + " self.file_extension = \"_\" + datetime.today().strftime('%H%M') + f\"lgK_{self.sketch_lgk}_lgT_{lg_trials}\"\n", + "\n", + " # Need to remove repeated items for the program logic in self.run()\n", + " self.plot_points = self._generate_plot_points()\n", + " self.plot_points.extend(self._generate_plot_points())\n", + " self.plot_points = list(set(self.plot_points))\n", + " self.plot_points.sort()\n", + " print(f\"Testing {len(self.plot_points)} points with {self.num_trials} trials for average update times:\")\n", + " print(self.plot_points)\n", + "\n", + " # Initialise the data structures for results; one for each method tested 8, 6, datasketch\n", + " # np.ndarrays\n", + " self.DataSketches_results_arr = np.zeros((len(self.plot_points), self.num_trials), dtype=float)\n", + " self.DataSketches_results_arr_6 = np.zeros_like(self.DataSketches_results_arr)\n", + " self.DataSketches_results_arr_4 = np.zeros_like(self.DataSketches_results_arr)\n", + " self.datasketch_results_arr = np.zeros_like(self.DataSketches_results_arr)\n", + "\n", + " # pd.DataFrames\n", + " self.DataSketches_results_df = pd.DataFrame(index=self.plot_points, columns=None)\n", + " self.DataSketches_results_df_6 = pd.DataFrame(index=self.plot_points, columns=None)\n", + " self.DataSketches_results_df_4 = pd.DataFrame(index=self.plot_points, columns=None)\n", + " self.datasketch_results_df = pd.DataFrame(index=self.plot_points, columns=None)\n", + " \n", + " \n", + "\n", + " def _generate_plot_points(self) -> list:\n", + " \"\"\"\n", + " Generates the standard sequence defining the input cardinalites for the experiment\n", + " This is just two points at each power of 2\n", + " \"\"\"\n", + " all_plot_points = []\n", + " for lgk in range(1, self.max_lgN+1):\n", + " points = np.unique(np.logspace(start=lgk, stop=lgk+1, num=4, endpoint=False, base=2, dtype=np.uint64))\n", + " all_plot_points.extend(points)\n", + " all_plot_points.sort()\n", + " return all_plot_points\n", + "\n", + " def _is_power_of_two(self, a:np.uint64) -> bool:\n", + " \"\"\"Bitwise operations to check value a is a power of two\"\"\"\n", + " return (a & (a-1) == 0) and a != 0\n", + "\n", + " def _results_to_df(self, start_:int, end_:int, arr:np.array, df:pd.DataFrame) -> pd.DataFrame:\n", + " \"\"\"Concatenates the array between columns start_,...end_ - 1 to the dataframe\"\"\"\n", + " new_df = pd.DataFrame(arr[:, start_:end_], index=df.index, columns=np.arange(start_, end_).tolist())\n", + " print(\"concatenating: \", new_df)\n", + " concat_df = pd.concat([df, new_df], axis=1)\n", + " return concat_df\n", + "\n", + " def run(self) -> None:\n", + " \"\"\"Runs the experiment and writes the files every power of two trials.\"\"\"\n", + " seq_start = np.uint64(2345234)\n", + " distinct_number = np.uint64(3462)\n", + " previous_log_trial_index = 0\n", + " ds_all_results = np.zeros((self.num_trials, len(self.plot_points)))\n", + " ds_all_results_6 = np.zeros_like(ds_all_results)\n", + " ds_all_results_4 = np.zeros_like(ds_all_results)\n", + " d_all_results = np.zeros_like(ds_all_results)\n", + "\n", + " for trial in range(1, self.num_trials+1):\n", + "\n", + " # Initialise the sketches\n", + " hll = ds.hll_sketch(self.sketch_lgk, ds.HLL_8)\n", + " hll6 = ds.hll_sketch(self.sketch_lgk, ds.HLL_6)\n", + " hll4 = ds.hll_sketch(self.sketch_lgk, ds.HLL_4)\n", + " h = d.HyperLogLogPlusPlus(p=self.sketch_lgk, hashfunc=lambda x: mmh3.hash64(x, signed=False)[0])\n", + " plot_point_index = 0 # Return to the start of the plot points list to generate the data\n", + " plot_point_value = self.plot_points[plot_point_index]\n", + " total_updates = 0\n", + " seq_start += distinct_number # Start a new input sequence\n", + "\n", + " # Temporary result data structure\n", + " ds_results = np.zeros((len(self.plot_points),))\n", + " ds_results_6 = np.zeros_like(ds_results)\n", + " ds_results_4 = np.zeros_like(ds_results)\n", + " d_results = np.zeros_like(ds_results)\n", + "\n", + "\n", + " for new_number in distinct_number_sequence(seq_start):\n", + " d_input = new_number.tobytes()\n", + " \n", + " hll8_start = default_timer()\n", + " hll.update(d_input)\n", + " hll8_end = default_timer()\n", + "\n", + " hll6_start = default_timer()\n", + " hll6.update(d_input)\n", + " hll6_end = default_timer()\n", + "\n", + " hll4_start = default_timer()\n", + " hll4.update(d_input)\n", + " hll4_end = default_timer()\n", + "\n", + " d_start = default_timer()\n", + " h.update(d_input)\n", + " d_end = default_timer()\n", + " total_updates += 1\n", + " \n", + " if total_updates == plot_point_value:\n", + " ds_results[plot_point_index] = hll8_end - hll8_start\n", + " ds_results_6[plot_point_index] = hll6_end - hll6_start\n", + " ds_results_4[plot_point_index] = hll4_end - hll4_start\n", + " d_results[plot_point_index] = d_end - d_start\n", + " plot_point_index += 1\n", + "\n", + " if plot_point_index < len(self.plot_points):\n", + " plot_point_value = self.plot_points[plot_point_index]\n", + " else:\n", + " break\n", + "\n", + " # After the break statement, control returns here. Now need to decide whether to write or continue.\n", + " # subtract 1 as we use 1-based indexing for the trial count.\n", + " ds_all_results[trial-1, :] = ds_results \n", + " ds_all_results_6[trial-1, :] = ds_results_6 \n", + " ds_all_results_4[trial-1, :] = ds_results_4\n", + " d_all_results[trial - 1, :] = d_results \n", + " if self._is_power_of_two(trial) and trial > 1:\n", + " # write the array only a logarithmic number of times\n", + " temporary_ds_results = ds_all_results[0:trial, : ]\n", + " temporary_ds_results_6 = ds_all_results_6[0:trial, : ]\n", + " temporary_ds_results_4 = ds_all_results_4[0:trial, : ]\n", + " temporary_d_results = d_all_results[0:trial, :]\n", + " print(f\"#################### PARTIAL RESULTS FOR {trial} TRIALS: DATASKETCHES ####################\")\n", + " previous_log_trial_index = trial\n", + "\n", + " # Write 8 bit results\n", + " self.DataSketches_results_df = pd.DataFrame(temporary_ds_results.T, \n", + " index=self.DataSketches_results_df.index, \n", + " columns=np.arange(trial).tolist())\n", + " self.DataSketches_results_df.to_csv(\n", + " self.directory_name + \"/DataSketches_hll\" + self.file_extension + f\"trials_{trial}_8_bit.csv\",\n", + " index_label=\"n\")\n", + "\n", + " # Write 6 bit results\n", + " self.DataSketches_results_df_6 = pd.DataFrame(temporary_ds_results_6.T, \n", + " index=self.DataSketches_results_df_6.index, \n", + " columns=np.arange(trial).tolist())\n", + " self.DataSketches_results_df_6.to_csv(\n", + " self.directory_name + \"/DataSketches_hll\" + self.file_extension + f\"trials_{trial}_6_bit.csv\",\n", + " index_label=\"n\")\n", + "\n", + " # Write 4 bit results\n", + " self.DataSketches_results_df_4 = pd.DataFrame(temporary_ds_results_4.T, \n", + " index=self.DataSketches_results_df_4.index, \n", + " columns=np.arange(trial).tolist())\n", + " self.DataSketches_results_df_4.to_csv(\n", + " self.directory_name + \"/DataSketches_hll\" + self.file_extension + f\"trials_{trial}_4_bit.csv\",\n", + " index_label=\"n\")\n", + "\n", + " # Write datasketch results\n", + " self.datasketch_results_df = pd.DataFrame(temporary_d_results.T,\n", + " index=self.datasketch_results_df.index,\n", + " columns=np.arange(trial).tolist())\n", + " self.datasketch_results_df.to_csv(\n", + " self.directory_name + \"/datasketch_hll\" + self.file_extension + f\"trials_{trial}.csv\",\n", + " index_label=\"n\"\n", + " )\n", + " print(self.DataSketches_results_df)" + ] + }, + { + "cell_type": "markdown", + "id": "4e7f5276-733e-4460-9076-cfeb50f844bb", + "metadata": {}, + "source": [ + "The experiment times all updates and saves the update time for particluar test points that are fixed for all trials. The median of the written results are plotted and we evaluate the speedup between the libraries.\n", + "\n", + "We set the default experimental parameters below. If you need the example to run more quickly then you can run fewer trials. Using \n", + "\n", + "```SKETCH_LGK = 12, LG_TRIALS = 5, MAX_LG_N = 21``` \n", + "\n", + "takes about $10$ minutes to run on a 2023 MacBook Pro with an Apple M1 Pro chip running Ventura 13.5.2. Absolute timings may vary slightly on different hardware. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "22a78282-42b8-4f03-b109-f1ab232e565f", + "metadata": {}, + "outputs": [], + "source": [ + "SKETCH_LGK = 12\n", + "LG_TRIALS = 5\n", + "MAX_LG_N = 21" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e68e3037-0456-4cb8-bd6f-d0943e32d3cc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing 81 points with 32 trials for average update times:\n", + "[2, 3, 4, 5, 6, 8, 9, 11, 13, 16, 19, 22, 26, 32, 38, 45, 53, 64, 76, 90, 107, 128, 152, 181, 215, 256, 304, 362, 430, 512, 608, 724, 861, 1024, 1217, 1448, 1722, 2048, 2435, 2896, 3444, 4096, 4870, 5792, 6888, 8192, 9741, 11585, 13777, 16384, 19483, 23170, 27554, 32768, 38967, 46340, 55108, 65536, 77935, 92681, 110217, 131072, 155871, 185363, 220435, 262144, 311743, 370727, 440871, 524288, 623487, 741455, 881743, 1048576, 1246974, 1482910, 1763487, 2097152, 2493948, 2965820, 3526975]\n" + ] + } + ], + "source": [ + "update_experiment = UpdateTimeProfile(SKETCH_LGK, LG_TRIALS, MAX_LG_N)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "53db0788-822e-4bba-af89-0071da57d282", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/charlied/dev/python-sketching-lib-comparison/datasketches-python/jupyter/comparison-to-datasketch/utils.py:27: RuntimeWarning: overflow encountered in scalar add\n", + " num += np.uint64(golden_ratio)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#################### PARTIAL RESULTS FOR 2 TRIALS: DATASKETCHES ####################\n", + " 0 1\n", + "2 1.166001e-06 7.089984e-07\n", + "3 1.162500e-05 7.499984e-07\n", + "4 1.000000e-06 3.583002e-06\n", + "5 8.329989e-07 7.499984e-07\n", + "6 8.749994e-07 7.499984e-07\n", + "... ... ...\n", + "1763487 7.079980e-07 7.079980e-07\n", + "2097152 7.079980e-07 7.090021e-07\n", + "2493948 7.499984e-07 7.500021e-07\n", + "2965820 7.089984e-07 7.089984e-07\n", + "3526975 6.669979e-07 7.079980e-07\n", + "\n", + "[81 rows x 2 columns]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/charlied/dev/python-sketching-lib-comparison/datasketches-python/jupyter/comparison-to-datasketch/utils.py:27: RuntimeWarning: overflow encountered in scalar add\n", + " num += np.uint64(golden_ratio)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#################### PARTIAL RESULTS FOR 4 TRIALS: DATASKETCHES ####################\n", + " 0 1 2 3\n", + "2 1.166001e-06 7.089984e-07 9.159994e-07 7.499984e-07\n", + "3 1.162500e-05 7.499984e-07 1.375000e-06 7.909985e-07\n", + "4 1.000000e-06 3.583002e-06 7.919989e-07 7.080016e-07\n", + "5 8.329989e-07 7.499984e-07 7.919989e-07 7.499984e-07\n", + "6 8.749994e-07 7.499984e-07 7.499984e-07 7.089984e-07\n", + "... ... ... ... ...\n", + "1763487 7.079980e-07 7.079980e-07 7.499984e-07 7.080016e-07\n", + "2097152 7.079980e-07 7.090021e-07 7.080016e-07 7.079980e-07\n", + "2493948 7.499984e-07 7.500021e-07 7.089984e-07 7.089984e-07\n", + "2965820 7.089984e-07 7.089984e-07 6.669979e-07 7.090021e-07\n", + "3526975 6.669979e-07 7.079980e-07 7.079980e-07 7.080016e-07\n", + "\n", + "[81 rows x 4 columns]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/charlied/dev/python-sketching-lib-comparison/datasketches-python/jupyter/comparison-to-datasketch/utils.py:27: RuntimeWarning: overflow encountered in scalar add\n", + " num += np.uint64(golden_ratio)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#################### PARTIAL RESULTS FOR 8 TRIALS: DATASKETCHES ####################\n", + " 0 1 2 3 4 \\\n", + "2 1.166001e-06 7.089984e-07 9.159994e-07 7.499984e-07 8.339994e-07 \n", + "3 1.162500e-05 7.499984e-07 1.375000e-06 7.909985e-07 1.249999e-06 \n", + "4 1.000000e-06 3.583002e-06 7.919989e-07 7.080016e-07 7.909985e-07 \n", + "5 8.329989e-07 7.499984e-07 7.919989e-07 7.499984e-07 7.500021e-07 \n", + "6 8.749994e-07 7.499984e-07 7.499984e-07 7.089984e-07 7.500021e-07 \n", + "... ... ... ... ... ... \n", + "1763487 7.079980e-07 7.079980e-07 7.499984e-07 7.080016e-07 7.089984e-07 \n", + "2097152 7.079980e-07 7.090021e-07 7.080016e-07 7.079980e-07 7.079980e-07 \n", + "2493948 7.499984e-07 7.500021e-07 7.089984e-07 7.089984e-07 7.079980e-07 \n", + "2965820 7.089984e-07 7.089984e-07 6.669979e-07 7.090021e-07 7.500021e-07 \n", + "3526975 6.669979e-07 7.079980e-07 7.079980e-07 7.080016e-07 6.669979e-07 \n", + "\n", + " 5 6 7 \n", + "2 1.375000e-06 7.080016e-07 7.499984e-07 \n", + "3 7.500021e-07 7.910021e-07 7.089984e-07 \n", + "4 7.499984e-07 7.089984e-07 7.080016e-07 \n", + "5 7.499984e-07 6.670016e-07 7.090021e-07 \n", + "6 7.089984e-07 7.080016e-07 7.089984e-07 \n", + "... ... ... ... \n", + "1763487 7.079980e-07 7.499984e-07 7.090021e-07 \n", + "2097152 6.659975e-07 7.080016e-07 7.080016e-07 \n", + "2493948 7.079980e-07 7.090021e-07 6.660011e-07 \n", + "2965820 7.079980e-07 6.670016e-07 6.670016e-07 \n", + "3526975 6.670016e-07 7.089984e-07 7.090021e-07 \n", + "\n", + "[81 rows x 8 columns]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/charlied/dev/python-sketching-lib-comparison/datasketches-python/jupyter/comparison-to-datasketch/utils.py:27: RuntimeWarning: overflow encountered in scalar add\n", + " num += np.uint64(golden_ratio)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#################### PARTIAL RESULTS FOR 16 TRIALS: DATASKETCHES ####################\n", + " 0 1 2 3 4 \\\n", + "2 1.166001e-06 7.089984e-07 9.159994e-07 7.499984e-07 8.339994e-07 \n", + "3 1.162500e-05 7.499984e-07 1.375000e-06 7.909985e-07 1.249999e-06 \n", + "4 1.000000e-06 3.583002e-06 7.919989e-07 7.080016e-07 7.909985e-07 \n", + "5 8.329989e-07 7.499984e-07 7.919989e-07 7.499984e-07 7.500021e-07 \n", + "6 8.749994e-07 7.499984e-07 7.499984e-07 7.089984e-07 7.500021e-07 \n", + "... ... ... ... ... ... \n", + "1763487 7.079980e-07 7.079980e-07 7.499984e-07 7.080016e-07 7.089984e-07 \n", + "2097152 7.079980e-07 7.090021e-07 7.080016e-07 7.079980e-07 7.079980e-07 \n", + "2493948 7.499984e-07 7.500021e-07 7.089984e-07 7.089984e-07 7.079980e-07 \n", + "2965820 7.089984e-07 7.089984e-07 6.669979e-07 7.090021e-07 7.500021e-07 \n", + "3526975 6.669979e-07 7.079980e-07 7.079980e-07 7.080016e-07 6.669979e-07 \n", + "\n", + " 5 6 7 8 9 \\\n", + "2 1.375000e-06 7.080016e-07 7.499984e-07 7.919989e-07 7.500021e-07 \n", + "3 7.500021e-07 7.910021e-07 7.089984e-07 1.332999e-06 7.499984e-07 \n", + "4 7.499984e-07 7.089984e-07 7.080016e-07 8.330026e-07 2.666999e-06 \n", + "5 7.499984e-07 6.670016e-07 7.090021e-07 7.909985e-07 7.079980e-07 \n", + "6 7.089984e-07 7.080016e-07 7.089984e-07 7.909985e-07 7.499984e-07 \n", + "... ... ... ... ... ... \n", + "1763487 7.079980e-07 7.499984e-07 7.090021e-07 7.080016e-07 7.090021e-07 \n", + "2097152 6.659975e-07 7.080016e-07 7.080016e-07 7.500021e-07 7.090021e-07 \n", + "2493948 7.079980e-07 7.090021e-07 6.660011e-07 7.500021e-07 7.499984e-07 \n", + "2965820 7.079980e-07 6.670016e-07 6.670016e-07 7.080016e-07 7.089984e-07 \n", + "3526975 6.670016e-07 7.089984e-07 7.090021e-07 7.079980e-07 6.660011e-07 \n", + "\n", + " 10 11 12 13 14 \\\n", + "2 7.090021e-07 8.749994e-07 7.499984e-07 7.089984e-07 7.080016e-07 \n", + "3 7.080016e-07 7.079980e-07 7.919989e-07 7.920025e-07 7.500021e-07 \n", + "4 7.080016e-07 8.750030e-07 7.080016e-07 2.790999e-06 7.079980e-07 \n", + "5 7.500021e-07 7.090021e-07 7.080016e-07 7.499984e-07 7.079980e-07 \n", + "6 7.089984e-07 7.080016e-07 7.079980e-07 7.080016e-07 7.080016e-07 \n", + "... ... ... ... ... ... \n", + "1763487 7.499984e-07 7.079980e-07 7.090021e-07 7.079980e-07 7.089984e-07 \n", + "2097152 7.079980e-07 7.090021e-07 7.079980e-07 7.089984e-07 7.090021e-07 \n", + "2493948 7.079980e-07 7.089984e-07 6.669979e-07 7.089984e-07 7.080016e-07 \n", + "2965820 7.089984e-07 7.499984e-07 6.669979e-07 6.669979e-07 7.080016e-07 \n", + "3526975 7.080016e-07 6.669979e-07 6.670016e-07 7.080016e-07 7.499984e-07 \n", + "\n", + " 15 \n", + "2 7.500021e-07 \n", + "3 7.089984e-07 \n", + "4 2.834000e-06 \n", + "5 7.499984e-07 \n", + "6 7.499984e-07 \n", + "... ... \n", + "1763487 7.079980e-07 \n", + "2097152 7.500021e-07 \n", + "2493948 6.670016e-07 \n", + "2965820 7.090021e-07 \n", + "3526975 6.670016e-07 \n", + "\n", + "[81 rows x 16 columns]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/charlied/dev/python-sketching-lib-comparison/datasketches-python/jupyter/comparison-to-datasketch/utils.py:27: RuntimeWarning: overflow encountered in scalar add\n", + " num += np.uint64(golden_ratio)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "#################### PARTIAL RESULTS FOR 32 TRIALS: DATASKETCHES ####################\n", + " 0 1 2 3 4 \\\n", + "2 1.166001e-06 7.089984e-07 9.159994e-07 7.499984e-07 8.339994e-07 \n", + "3 1.162500e-05 7.499984e-07 1.375000e-06 7.909985e-07 1.249999e-06 \n", + "4 1.000000e-06 3.583002e-06 7.919989e-07 7.080016e-07 7.909985e-07 \n", + "5 8.329989e-07 7.499984e-07 7.919989e-07 7.499984e-07 7.500021e-07 \n", + "6 8.749994e-07 7.499984e-07 7.499984e-07 7.089984e-07 7.500021e-07 \n", + "... ... ... ... ... ... \n", + "1763487 7.079980e-07 7.079980e-07 7.499984e-07 7.080016e-07 7.089984e-07 \n", + "2097152 7.079980e-07 7.090021e-07 7.080016e-07 7.079980e-07 7.079980e-07 \n", + "2493948 7.499984e-07 7.500021e-07 7.089984e-07 7.089984e-07 7.079980e-07 \n", + "2965820 7.089984e-07 7.089984e-07 6.669979e-07 7.090021e-07 7.500021e-07 \n", + "3526975 6.669979e-07 7.079980e-07 7.079980e-07 7.080016e-07 6.669979e-07 \n", + "\n", + " 5 6 7 8 9 \\\n", + "2 1.375000e-06 7.080016e-07 7.499984e-07 7.919989e-07 7.500021e-07 \n", + "3 7.500021e-07 7.910021e-07 7.089984e-07 1.332999e-06 7.499984e-07 \n", + "4 7.499984e-07 7.089984e-07 7.080016e-07 8.330026e-07 2.666999e-06 \n", + "5 7.499984e-07 6.670016e-07 7.090021e-07 7.909985e-07 7.079980e-07 \n", + "6 7.089984e-07 7.080016e-07 7.089984e-07 7.909985e-07 7.499984e-07 \n", + "... ... ... ... ... ... \n", + "1763487 7.079980e-07 7.499984e-07 7.090021e-07 7.080016e-07 7.090021e-07 \n", + "2097152 6.659975e-07 7.080016e-07 7.080016e-07 7.500021e-07 7.090021e-07 \n", + "2493948 7.079980e-07 7.090021e-07 6.660011e-07 7.500021e-07 7.499984e-07 \n", + "2965820 7.079980e-07 6.670016e-07 6.670016e-07 7.080016e-07 7.089984e-07 \n", + "3526975 6.670016e-07 7.089984e-07 7.090021e-07 7.079980e-07 6.660011e-07 \n", + "\n", + " ... 22 23 24 25 \\\n", + "2 ... 7.500021e-07 7.919989e-07 7.080016e-07 7.079980e-07 \n", + "3 ... 7.919989e-07 7.920025e-07 7.499984e-07 7.909985e-07 \n", + "4 ... 7.500021e-07 7.500021e-07 7.080016e-07 7.089984e-07 \n", + "5 ... 7.499984e-07 7.080016e-07 7.089984e-07 7.500021e-07 \n", + "6 ... 7.499984e-07 7.089984e-07 7.079980e-07 7.080016e-07 \n", + "... ... ... ... ... ... \n", + "1763487 ... 3.292000e-06 7.499984e-07 7.080016e-07 7.089984e-07 \n", + "2097152 ... 7.080016e-07 7.500021e-07 7.080016e-07 7.079980e-07 \n", + "2493948 ... 7.089984e-07 7.089984e-07 6.670016e-07 7.090021e-07 \n", + "2965820 ... 6.670016e-07 6.669979e-07 7.089984e-07 7.500021e-07 \n", + "3526975 ... 7.080016e-07 7.080016e-07 6.670016e-07 7.500021e-07 \n", + "\n", + " 26 27 28 29 30 \\\n", + "2 7.499984e-07 7.500021e-07 7.499984e-07 7.089984e-07 7.499984e-07 \n", + "3 7.500021e-07 7.499984e-07 7.920025e-07 7.079980e-07 7.920025e-07 \n", + "4 7.079980e-07 7.080016e-07 7.500021e-07 7.500021e-07 2.624998e-06 \n", + "5 7.499984e-07 7.500021e-07 7.500021e-07 8.329989e-07 7.079980e-07 \n", + "6 7.079980e-07 7.089984e-07 7.500021e-07 7.499984e-07 7.080016e-07 \n", + "... ... ... ... ... ... \n", + "1763487 7.919989e-07 7.079980e-07 7.090021e-07 7.079980e-07 7.089984e-07 \n", + "2097152 7.080016e-07 7.079980e-07 7.499984e-07 7.089984e-07 6.669979e-07 \n", + "2493948 7.080016e-07 7.079980e-07 7.079980e-07 7.499984e-07 6.670016e-07 \n", + "2965820 7.079980e-07 6.669979e-07 7.079980e-07 6.670016e-07 7.079980e-07 \n", + "3526975 7.080016e-07 7.090021e-07 7.080016e-07 7.089984e-07 7.090021e-07 \n", + "\n", + " 31 \n", + "2 7.080016e-07 \n", + "3 7.500021e-07 \n", + "4 7.080016e-07 \n", + "5 7.499984e-07 \n", + "6 7.910021e-07 \n", + "... ... \n", + "1763487 6.660011e-07 \n", + "2097152 7.500021e-07 \n", + "2493948 7.090021e-07 \n", + "2965820 6.669979e-07 \n", + "3526975 7.079980e-07 \n", + "\n", + "[81 rows x 32 columns]\n", + "CPU times: user 8min 58s, sys: 1.78 s, total: 9min\n", + "Wall time: 9min 1s\n" + ] + } + ], + "source": [ + "%%time\n", + "update_experiment.run()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "54db1c44-bc9a-4b02-93da-c96190669bf3", + "metadata": {}, + "outputs": [], + "source": [ + "asf8 = pd.read_csv(f\"hll_update_time_profile_/DataSketches_hll{update_experiment.file_extension}trials_32_8_bit.csv\", index_col=0)\n", + "asf6 = pd.read_csv(f\"hll_update_time_profile_/DataSketches_hll{update_experiment.file_extension}trials_32_6_bit.csv\", index_col=0)\n", + "asf4 = pd.read_csv(f\"hll_update_time_profile_/DataSketches_hll{update_experiment.file_extension}trials_32_4_bit.csv\", index_col=0)\n", + "dsk = pd.read_csv(f\"hll_update_time_profile_/datasketch_hll{update_experiment.file_extension}trials_32.csv\", index_col=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "39a1070c-5b00-43a4-85d2-76711aa99084", + "metadata": {}, + "outputs": [], + "source": [ + "# Plotting parameters\n", + "method_plot_params = {\n", + " \"asf8\" : {\"color\": \"C0\", \"marker\": '.'},\n", + " \"asf6\" : {\"color\": \"C2\", \"marker\": '+'},\n", + " \"asf4\" : {\"color\": \"C3\", \"marker\": 'd'},\n", + " \"datasketch\" : {\"color\": \"C1\", \"marker\": \"^\"}\n", + "}\n", + "asf8_color = method_plot_params[\"asf8\"][\"color\"]\n", + "asf6_color = method_plot_params[\"asf6\"][\"color\"]\n", + "asf4_color = method_plot_params[\"asf4\"][\"color\"]\n", + "ds_color = method_plot_params[\"datasketch\"][\"color\"]\n", + "q90_ls = \"--\"\n", + "\n", + "params = {'legend.fontsize': 'x-large',\n", + " 'axes.labelsize': 'x-large',\n", + " 'axes.titlesize':'x-large',\n", + " 'xtick.labelsize':'x-large',\n", + " 'ytick.labelsize':'x-large',\n", + " \"lines.linewidth\": 2.5}\n", + "plt.rcParams.update(params)" + ] + }, + { + "cell_type": "markdown", + "id": "6c340f56-6301-4fcf-826e-c3962c93683d", + "metadata": {}, + "source": [ + "We plot on the $y$-axis the median update time (in microseconds) and $x$-axis the input cardinality on a $\\log_{10}$ scale.\n", + "As expected, for both libraries, we see that the update time for a single item is essentially constant as $n$ increases, barring a small number of minor blips." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "31f1125d-e5ea-41af-abf3-d99a8ffb87b7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(6e-07, 1.6e-06)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAA/gAAAHHCAYAAAALaPPGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAC450lEQVR4nOzdd3hUVfoH8O+dkt4TSkhIoYTeBBHpRUFRQMQOQhR7QcTdtS1Ll9WfyoJlXUBpoqsLGoqIdESlQ2hSUwgJJZDeM+X+/rjMZO6UZFqSSfh+nicPmTt37pwhZ2bue8973iOIoiiCiIiIiIiIiBo0RX03gIiIiIiIiIhcxwCfiIiIiIiIqBFggE9ERERERETUCDDAJyIiIiIiImoEGOATERERERERNQIM8ImIiIiIiIgaAQb4RERERERERI0AA3wiIiIiIiKiRkBV3w0gIgIAjUYDnU5X380gIiIiFyiV [...] + "text/plain": [ + "<Figure size 1200x400 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12,4))\n", + "\n", + "methods = [\"ASF8\", \"ASF6\", \"ASF4\", \"datasketch\"]\n", + "\n", + "for i, (method, colour, df) in enumerate(zip(methods, [asf8_color, asf6_color, asf4_color, ds_color], [asf8, asf6, asf4, dsk])):\n", + " xn = df.index \n", + " median = df.median(axis=1)\n", + " ax.plot(xn, median,\n", + " color=colour, label=method+\": median\", alpha=0.5)\n", + "\n", + "ax.set_xscale('log', base=10)\n", + "ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.25),\n", + " ncol=2, fancybox=True)\n", + "ax.grid()\n", + "ax.set_ylabel(\"Median Update time\")\n", + "ax.set_xlabel(r\"Input cardinality $n$\")\n", + "ax.set_ylim(0.6E-6, 1.6E-6)" + ] + }, + { + "cell_type": "markdown", + "id": "5d969c60-4df4-4ee6-a570-604a5a68e655", + "metadata": {}, + "source": [ + "We evaluate the factor speedup per update. This is defined as the ratio of the total time per update required in using the datasketch library compred to the ASF implementation. We see that the datasketch library generally takes twice as much time per update compared to the ASF implementation. Since there is essentially no difference between the $4, 6, 8$ bit versions of ASF HyperLogLog, we only plot the $8$ bit comparison below." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6d68deef-26a8-411c-b0b1-e2182e41b967", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Text(0.5, 0, 'Input cardinality $n$')" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAABAkAAAGECAYAAABUEXkgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAB/lUlEQVR4nO3deVxU5f4H8M9sDOuAuKIgiIqiZuaGoqhoWmmZpamlpmWaqeVyb/faKlpdr7f6WVaWZqnZZu6WKyluqLjvIgrijgvKDsMs5/cHMM4wCwMzAwf8vF8vXjDnnDnzHOY7MM93nuf7SARBEEBEREREREREDz1pVTeAiIiIiIiIiMSBSQIiIiIiIiIiAsAkAREREREREREVY5KAiIiIiIiIiAAwSUBERERERERExZgkICIiIiIiIiIATBIQERERERERUTEmCYiIiIiIiIgIACCv6gY8bPR6PW7cuAEfHx9I [...] + "text/plain": [ + "<Figure size 1200x400 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig, ax = plt.subplots(figsize=(12,4))\n", + "\n", + "methods = [\"ASF8\", \"ASF6\", \"ASF4\", \"datasketch\"]\n", + "\n", + "\n", + "xn = asf8.index\n", + "xspeedup = (dsk / asf8).median(axis=1)\n", + "ax.plot(xn, xspeedup)\n", + "\n", + "ax.set_xscale('log', base=10)\n", + "ax.grid()\n", + "ax.set_ylabel(r\"$\\times$ Speedup\")\n", + "ax.set_xlabel(r\"Input cardinality $n$\")\n", + "#ax.set_ylim(0.25E-6, 1.5E-6)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
