(datasketches-python) 01/01: Added update time comparison

charlie Wed, 22 Nov 2023 05:01:11 -0800

This is an automated email from the ASF dual-hosted git repository.

charlie pushed a commit to branch lib-comparison
in repository https://gitbox.apache.org/repos/asf/datasketches-python.git


commit 52d43105be93b55ffb6052dbbb6b53b3cde5adcf
Author: Charlie Dickens <[email protected]>
AuthorDate: Wed Nov 22 12:23:49 2023 +0000

    Added update time comparison
---
 .../update-time-comparison.ipynb                   | 692 +++++++++++++++++++++
 1 file changed, 692 insertions(+)

diff --git a/jupyter/comparison-to-datasketch/update-time-comparison.ipynb 
b/jupyter/comparison-to-datasketch/update-time-comparison.ipynb
new file mode 100644
index 0000000..d66a6e8
--- /dev/null
+++ b/jupyter/comparison-to-datasketch/update-time-comparison.ipynb
@@ -0,0 +1,692 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0bea40e1-297c-4ccd-a24d-4d2ac631683c",
+   "metadata": {},
+   "source": [
+    "# Update Time Comparison\n",
+    "\n",
+    "We compare the update time between the Apache DataSketches (ASF) 
HyperLogLog implementations alongside the `datasketch` HyperLogLogPlusPlus 
implementation.  The same hash function is used for both libraries.  We compare 
$4, 6, 8$ bit versions of the ASF implementations, showing that they have 
roughly equivalent update times which are all faster than the `datasketch` 
algorithm (which only provides support for $8$ bit HyperLogLog).  We compare 
the update time when the input is the sam [...]
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "e5091679-86a1-48ae-a5d5-3a0be9dc2d84",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import argparse\n",
+    "from datetime import datetime\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from utils import distinct_number_sequence\n",
+    "import datasketches as ds\n",
+    "import datasketch as d\n",
+    "import mmh3\n",
+    "import os\n",
+    "import matplotlib.pyplot as plt\n",
+    "from timeit import default_timer\n",
+    "import warnings\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "dd7a7982-dc0b-4275-ba10-bbe02d214ea3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class UpdateTimeProfile:\n",
+    "    \"\"\"Generates an experiment evaluating the update time for 
different cardinality inputs\"\"\"\n",
+    "    def __init__(self, sketch_lgk:int, lg_trials:int, max_lgN:int):\n",
+    "        self.sketch_lgk = sketch_lgk\n",
+    "        self.num_trials = 2**lg_trials\n",
+    "        self.max_lgN = max_lgN\n",
+    "        self.max_num_distincts = np.uint64(2 ** self.max_lgN)\n",
+    "        self.directory_name = \"hll_update_time_profile_\"\n",
+    "        if not os.path.exists(self.directory_name):\n",
+    "            os.mkdir(self.directory_name)\n",
+    "        self.file_extension = \"_\" + datetime.today().strftime('%H%M') + 
f\"lgK_{self.sketch_lgk}_lgT_{lg_trials}\"\n",
+    "\n",
+    "        # Need to remove repeated items for the program logic in 
self.run()\n",
+    "        self.plot_points = self._generate_plot_points()\n",
+    "        self.plot_points.extend(self._generate_plot_points())\n",
+    "        self.plot_points = list(set(self.plot_points))\n",
+    "        self.plot_points.sort()\n",
+    "        print(f\"Testing {len(self.plot_points)} points with 
{self.num_trials} trials for average update times:\")\n",
+    "        print(self.plot_points)\n",
+    "\n",
+    "        # Initialise the data structures for results; one for each method 
tested 8, 6, datasketch\n",
+    "        # np.ndarrays\n",
+    "        self.DataSketches_results_arr = np.zeros((len(self.plot_points), 
self.num_trials), dtype=float)\n",
+    "        self.DataSketches_results_arr_6 =  
np.zeros_like(self.DataSketches_results_arr)\n",
+    "        self.DataSketches_results_arr_4 =  
np.zeros_like(self.DataSketches_results_arr)\n",
+    "        self.datasketch_results_arr = 
np.zeros_like(self.DataSketches_results_arr)\n",
+    "\n",
+    "        # pd.DataFrames\n",
+    "        self.DataSketches_results_df = 
pd.DataFrame(index=self.plot_points, columns=None)\n",
+    "        self.DataSketches_results_df_6 = 
pd.DataFrame(index=self.plot_points, columns=None)\n",
+    "        self.DataSketches_results_df_4 = 
pd.DataFrame(index=self.plot_points, columns=None)\n",
+    "        self.datasketch_results_df = pd.DataFrame(index=self.plot_points, 
columns=None)\n",
+    "        \n",
+    "        \n",
+    "\n",
+    "    def _generate_plot_points(self) -> list:\n",
+    "        \"\"\"\n",
+    "        Generates the standard sequence defining the input cardinalites 
for the experiment\n",
+    "        This is just two points at each power of 2\n",
+    "        \"\"\"\n",
+    "        all_plot_points = []\n",
+    "        for lgk in range(1, self.max_lgN+1):\n",
+    "            points = np.unique(np.logspace(start=lgk, stop=lgk+1, num=4, 
endpoint=False, base=2, dtype=np.uint64))\n",
+    "            all_plot_points.extend(points)\n",
+    "        all_plot_points.sort()\n",
+    "        return all_plot_points\n",
+    "\n",
+    "    def _is_power_of_two(self, a:np.uint64) -> bool:\n",
+    "        \"\"\"Bitwise operations to check value a is a power of 
two\"\"\"\n",
+    "        return (a & (a-1) == 0) and a != 0\n",
+    "\n",
+    "    def _results_to_df(self, start_:int, end_:int, arr:np.array, 
df:pd.DataFrame) -> pd.DataFrame:\n",
+    "        \"\"\"Concatenates the array between columns start_,...end_ - 1 
to the dataframe\"\"\"\n",
+    "        new_df = pd.DataFrame(arr[:, start_:end_], index=df.index, 
columns=np.arange(start_, end_).tolist())\n",
+    "        print(\"concatenating: \", new_df)\n",
+    "        concat_df = pd.concat([df, new_df], axis=1)\n",
+    "        return concat_df\n",
+    "\n",
+    "    def run(self) -> None:\n",
+    "        \"\"\"Runs the experiment and writes the files every power of two 
trials.\"\"\"\n",
+    "        seq_start = np.uint64(2345234)\n",
+    "        distinct_number = np.uint64(3462)\n",
+    "        previous_log_trial_index = 0\n",
+    "        ds_all_results = np.zeros((self.num_trials, 
len(self.plot_points)))\n",
+    "        ds_all_results_6 = np.zeros_like(ds_all_results)\n",
+    "        ds_all_results_4 = np.zeros_like(ds_all_results)\n",
+    "        d_all_results = np.zeros_like(ds_all_results)\n",
+    "\n",
+    "        for trial in range(1, self.num_trials+1):\n",
+    "\n",
+    "            # Initialise the sketches\n",
+    "            hll = ds.hll_sketch(self.sketch_lgk, ds.HLL_8)\n",
+    "            hll6 = ds.hll_sketch(self.sketch_lgk, ds.HLL_6)\n",
+    "            hll4 = ds.hll_sketch(self.sketch_lgk, ds.HLL_4)\n",
+    "            h = d.HyperLogLogPlusPlus(p=self.sketch_lgk, hashfunc=lambda 
x: mmh3.hash64(x, signed=False)[0])\n",
+    "            plot_point_index = 0  # Return to the start of the plot 
points list to generate the data\n",
+    "            plot_point_value = self.plot_points[plot_point_index]\n",
+    "            total_updates = 0\n",
+    "            seq_start += distinct_number  # Start a new input sequence\n",
+    "\n",
+    "            # Temporary result data structure\n",
+    "            ds_results = np.zeros((len(self.plot_points),))\n",
+    "            ds_results_6 = np.zeros_like(ds_results)\n",
+    "            ds_results_4 = np.zeros_like(ds_results)\n",
+    "            d_results = np.zeros_like(ds_results)\n",
+    "\n",
+    "\n",
+    "            for new_number in distinct_number_sequence(seq_start):\n",
+    "                d_input = new_number.tobytes()\n",
+    "                \n",
+    "                hll8_start = default_timer()\n",
+    "                hll.update(d_input)\n",
+    "                hll8_end = default_timer()\n",
+    "\n",
+    "                hll6_start = default_timer()\n",
+    "                hll6.update(d_input)\n",
+    "                hll6_end = default_timer()\n",
+    "\n",
+    "                hll4_start = default_timer()\n",
+    "                hll4.update(d_input)\n",
+    "                hll4_end = default_timer()\n",
+    "\n",
+    "                d_start = default_timer()\n",
+    "                h.update(d_input)\n",
+    "                d_end = default_timer()\n",
+    "                total_updates += 1\n",
+    "                \n",
+    "                if total_updates == plot_point_value:\n",
+    "                    ds_results[plot_point_index]   =  hll8_end - 
hll8_start\n",
+    "                    ds_results_6[plot_point_index] =  hll6_end - 
hll6_start\n",
+    "                    ds_results_4[plot_point_index] =  hll4_end - 
hll4_start\n",
+    "                    d_results[plot_point_index]    =  d_end - d_start\n",
+    "                    plot_point_index += 1\n",
+    "\n",
+    "                    if plot_point_index < len(self.plot_points):\n",
+    "                        plot_point_value = 
self.plot_points[plot_point_index]\n",
+    "                    else:\n",
+    "                        break\n",
+    "\n",
+    "            # After the break statement, control returns here.  Now need 
to decide whether to write or continue.\n",
+    "            # subtract 1 as we use 1-based indexing for the trial 
count.\n",
+    "            ds_all_results[trial-1, :]   = ds_results \n",
+    "            ds_all_results_6[trial-1, :] = ds_results_6 \n",
+    "            ds_all_results_4[trial-1, :] = ds_results_4\n",
+    "            d_all_results[trial - 1, :]  = d_results  \n",
+    "            if self._is_power_of_two(trial) and trial > 1:\n",
+    "                # write the array only a logarithmic number of times\n",
+    "                temporary_ds_results = ds_all_results[0:trial, : ]\n",
+    "                temporary_ds_results_6 = ds_all_results_6[0:trial, : ]\n",
+    "                temporary_ds_results_4 = ds_all_results_4[0:trial, : ]\n",
+    "                temporary_d_results = d_all_results[0:trial, :]\n",
+    "                print(f\"#################### PARTIAL RESULTS FOR {trial} 
TRIALS: DATASKETCHES ####################\")\n",
+    "                previous_log_trial_index = trial\n",
+    "\n",
+    "                # Write 8 bit results\n",
+    "                self.DataSketches_results_df = 
pd.DataFrame(temporary_ds_results.T, \n",
+    "                                                            
index=self.DataSketches_results_df.index, \n",
+    "                                                            
columns=np.arange(trial).tolist())\n",
+    "                self.DataSketches_results_df.to_csv(\n",
+    "                    self.directory_name + \"/DataSketches_hll\" + 
self.file_extension + f\"trials_{trial}_8_bit.csv\",\n",
+    "                    index_label=\"n\")\n",
+    "\n",
+    "                # Write 6 bit results\n",
+    "                self.DataSketches_results_df_6 = 
pd.DataFrame(temporary_ds_results_6.T, \n",
+    "                                                            
index=self.DataSketches_results_df_6.index, \n",
+    "                                                            
columns=np.arange(trial).tolist())\n",
+    "                self.DataSketches_results_df_6.to_csv(\n",
+    "                    self.directory_name + \"/DataSketches_hll\" + 
self.file_extension + f\"trials_{trial}_6_bit.csv\",\n",
+    "                    index_label=\"n\")\n",
+    "\n",
+    "                 # Write 4 bit results\n",
+    "                self.DataSketches_results_df_4 = 
pd.DataFrame(temporary_ds_results_4.T, \n",
+    "                                                            
index=self.DataSketches_results_df_4.index, \n",
+    "                                                            
columns=np.arange(trial).tolist())\n",
+    "                self.DataSketches_results_df_4.to_csv(\n",
+    "                    self.directory_name + \"/DataSketches_hll\" + 
self.file_extension + f\"trials_{trial}_4_bit.csv\",\n",
+    "                    index_label=\"n\")\n",
+    "\n",
+    "                # Write datasketch results\n",
+    "                self.datasketch_results_df = 
pd.DataFrame(temporary_d_results.T,\n",
+    "                                                            
index=self.datasketch_results_df.index,\n",
+    "                                                            
columns=np.arange(trial).tolist())\n",
+    "                self.datasketch_results_df.to_csv(\n",
+    "                    self.directory_name + \"/datasketch_hll\" + 
self.file_extension + f\"trials_{trial}.csv\",\n",
+    "                    index_label=\"n\"\n",
+    "                )\n",
+    "                print(self.DataSketches_results_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4e7f5276-733e-4460-9076-cfeb50f844bb",
+   "metadata": {},
+   "source": [
+    "The experiment times all updates and saves the update time for particluar 
test points that are fixed for all trials.  The median of the written results 
are plotted and we evaluate the speedup between the libraries.\n",
+    "\n",
+    "We set the default experimental parameters below.  If you need the 
example to run more quickly then you can run fewer trials.  Using \n",
+    "\n",
+    "```SKETCH_LGK = 12, LG_TRIALS = 5, MAX_LG_N = 21``` \n",
+    "\n",
+    "takes about $10$ minutes to run on a 2023 MacBook Pro with an Apple M1 
Pro chip running Ventura 13.5.2. Absolute timings may vary slightly on 
different hardware.  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "22a78282-42b8-4f03-b109-f1ab232e565f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SKETCH_LGK = 12\n",
+    "LG_TRIALS = 5\n",
+    "MAX_LG_N = 21"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e68e3037-0456-4cb8-bd6f-d0943e32d3cc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Testing 81 points with 32 trials for average update times:\n",
+      "[2, 3, 4, 5, 6, 8, 9, 11, 13, 16, 19, 22, 26, 32, 38, 45, 53, 64, 76, 
90, 107, 128, 152, 181, 215, 256, 304, 362, 430, 512, 608, 724, 861, 1024, 
1217, 1448, 1722, 2048, 2435, 2896, 3444, 4096, 4870, 5792, 6888, 8192, 9741, 
11585, 13777, 16384, 19483, 23170, 27554, 32768, 38967, 46340, 55108, 65536, 
77935, 92681, 110217, 131072, 155871, 185363, 220435, 262144, 311743, 370727, 
440871, 524288, 623487, 741455, 881743, 1048576, 1246974, 1482910, 1763487, 
2097152, 2493948, 2965820, 3526975]\n"
+     ]
+    }
+   ],
+   "source": [
+    "update_experiment = UpdateTimeProfile(SKETCH_LGK, LG_TRIALS, MAX_LG_N)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "53db0788-822e-4bba-af89-0071da57d282",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      
"/Users/charlied/dev/python-sketching-lib-comparison/datasketches-python/jupyter/comparison-to-datasketch/utils.py:27:
 RuntimeWarning: overflow encountered in scalar add\n",
+      "  num += np.uint64(golden_ratio)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "#################### PARTIAL RESULTS FOR 2 TRIALS: DATASKETCHES 
####################\n",
+      "                    0             1\n",
+      "2        1.166001e-06  7.089984e-07\n",
+      "3        1.162500e-05  7.499984e-07\n",
+      "4        1.000000e-06  3.583002e-06\n",
+      "5        8.329989e-07  7.499984e-07\n",
+      "6        8.749994e-07  7.499984e-07\n",
+      "...               ...           ...\n",
+      "1763487  7.079980e-07  7.079980e-07\n",
+      "2097152  7.079980e-07  7.090021e-07\n",
+      "2493948  7.499984e-07  7.500021e-07\n",
+      "2965820  7.089984e-07  7.089984e-07\n",
+      "3526975  6.669979e-07  7.079980e-07\n",
+      "\n",
+      "[81 rows x 2 columns]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      
"/Users/charlied/dev/python-sketching-lib-comparison/datasketches-python/jupyter/comparison-to-datasketch/utils.py:27:
 RuntimeWarning: overflow encountered in scalar add\n",
+      "  num += np.uint64(golden_ratio)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "#################### PARTIAL RESULTS FOR 4 TRIALS: DATASKETCHES 
####################\n",
+      "                    0             1             2             3\n",
+      "2        1.166001e-06  7.089984e-07  9.159994e-07  7.499984e-07\n",
+      "3        1.162500e-05  7.499984e-07  1.375000e-06  7.909985e-07\n",
+      "4        1.000000e-06  3.583002e-06  7.919989e-07  7.080016e-07\n",
+      "5        8.329989e-07  7.499984e-07  7.919989e-07  7.499984e-07\n",
+      "6        8.749994e-07  7.499984e-07  7.499984e-07  7.089984e-07\n",
+      "...               ...           ...           ...           ...\n",
+      "1763487  7.079980e-07  7.079980e-07  7.499984e-07  7.080016e-07\n",
+      "2097152  7.079980e-07  7.090021e-07  7.080016e-07  7.079980e-07\n",
+      "2493948  7.499984e-07  7.500021e-07  7.089984e-07  7.089984e-07\n",
+      "2965820  7.089984e-07  7.089984e-07  6.669979e-07  7.090021e-07\n",
+      "3526975  6.669979e-07  7.079980e-07  7.079980e-07  7.080016e-07\n",
+      "\n",
+      "[81 rows x 4 columns]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      
"/Users/charlied/dev/python-sketching-lib-comparison/datasketches-python/jupyter/comparison-to-datasketch/utils.py:27:
 RuntimeWarning: overflow encountered in scalar add\n",
+      "  num += np.uint64(golden_ratio)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "#################### PARTIAL RESULTS FOR 8 TRIALS: DATASKETCHES 
####################\n",
+      "                    0             1             2             3         
    4  \\\n",
+      "2        1.166001e-06  7.089984e-07  9.159994e-07  7.499984e-07  
8.339994e-07   \n",
+      "3        1.162500e-05  7.499984e-07  1.375000e-06  7.909985e-07  
1.249999e-06   \n",
+      "4        1.000000e-06  3.583002e-06  7.919989e-07  7.080016e-07  
7.909985e-07   \n",
+      "5        8.329989e-07  7.499984e-07  7.919989e-07  7.499984e-07  
7.500021e-07   \n",
+      "6        8.749994e-07  7.499984e-07  7.499984e-07  7.089984e-07  
7.500021e-07   \n",
+      "...               ...           ...           ...           ...         
  ...   \n",
+      "1763487  7.079980e-07  7.079980e-07  7.499984e-07  7.080016e-07  
7.089984e-07   \n",
+      "2097152  7.079980e-07  7.090021e-07  7.080016e-07  7.079980e-07  
7.079980e-07   \n",
+      "2493948  7.499984e-07  7.500021e-07  7.089984e-07  7.089984e-07  
7.079980e-07   \n",
+      "2965820  7.089984e-07  7.089984e-07  6.669979e-07  7.090021e-07  
7.500021e-07   \n",
+      "3526975  6.669979e-07  7.079980e-07  7.079980e-07  7.080016e-07  
6.669979e-07   \n",
+      "\n",
+      "                    5             6             7  \n",
+      "2        1.375000e-06  7.080016e-07  7.499984e-07  \n",
+      "3        7.500021e-07  7.910021e-07  7.089984e-07  \n",
+      "4        7.499984e-07  7.089984e-07  7.080016e-07  \n",
+      "5        7.499984e-07  6.670016e-07  7.090021e-07  \n",
+      "6        7.089984e-07  7.080016e-07  7.089984e-07  \n",
+      "...               ...           ...           ...  \n",
+      "1763487  7.079980e-07  7.499984e-07  7.090021e-07  \n",
+      "2097152  6.659975e-07  7.080016e-07  7.080016e-07  \n",
+      "2493948  7.079980e-07  7.090021e-07  6.660011e-07  \n",
+      "2965820  7.079980e-07  6.670016e-07  6.670016e-07  \n",
+      "3526975  6.670016e-07  7.089984e-07  7.090021e-07  \n",
+      "\n",
+      "[81 rows x 8 columns]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      
"/Users/charlied/dev/python-sketching-lib-comparison/datasketches-python/jupyter/comparison-to-datasketch/utils.py:27:
 RuntimeWarning: overflow encountered in scalar add\n",
+      "  num += np.uint64(golden_ratio)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "#################### PARTIAL RESULTS FOR 16 TRIALS: DATASKETCHES 
####################\n",
+      "                   0             1             2             3          
   4   \\\n",
+      "2        1.166001e-06  7.089984e-07  9.159994e-07  7.499984e-07  
8.339994e-07   \n",
+      "3        1.162500e-05  7.499984e-07  1.375000e-06  7.909985e-07  
1.249999e-06   \n",
+      "4        1.000000e-06  3.583002e-06  7.919989e-07  7.080016e-07  
7.909985e-07   \n",
+      "5        8.329989e-07  7.499984e-07  7.919989e-07  7.499984e-07  
7.500021e-07   \n",
+      "6        8.749994e-07  7.499984e-07  7.499984e-07  7.089984e-07  
7.500021e-07   \n",
+      "...               ...           ...           ...           ...         
  ...   \n",
+      "1763487  7.079980e-07  7.079980e-07  7.499984e-07  7.080016e-07  
7.089984e-07   \n",
+      "2097152  7.079980e-07  7.090021e-07  7.080016e-07  7.079980e-07  
7.079980e-07   \n",
+      "2493948  7.499984e-07  7.500021e-07  7.089984e-07  7.089984e-07  
7.079980e-07   \n",
+      "2965820  7.089984e-07  7.089984e-07  6.669979e-07  7.090021e-07  
7.500021e-07   \n",
+      "3526975  6.669979e-07  7.079980e-07  7.079980e-07  7.080016e-07  
6.669979e-07   \n",
+      "\n",
+      "                   5             6             7             8          
   9   \\\n",
+      "2        1.375000e-06  7.080016e-07  7.499984e-07  7.919989e-07  
7.500021e-07   \n",
+      "3        7.500021e-07  7.910021e-07  7.089984e-07  1.332999e-06  
7.499984e-07   \n",
+      "4        7.499984e-07  7.089984e-07  7.080016e-07  8.330026e-07  
2.666999e-06   \n",
+      "5        7.499984e-07  6.670016e-07  7.090021e-07  7.909985e-07  
7.079980e-07   \n",
+      "6        7.089984e-07  7.080016e-07  7.089984e-07  7.909985e-07  
7.499984e-07   \n",
+      "...               ...           ...           ...           ...         
  ...   \n",
+      "1763487  7.079980e-07  7.499984e-07  7.090021e-07  7.080016e-07  
7.090021e-07   \n",
+      "2097152  6.659975e-07  7.080016e-07  7.080016e-07  7.500021e-07  
7.090021e-07   \n",
+      "2493948  7.079980e-07  7.090021e-07  6.660011e-07  7.500021e-07  
7.499984e-07   \n",
+      "2965820  7.079980e-07  6.670016e-07  6.670016e-07  7.080016e-07  
7.089984e-07   \n",
+      "3526975  6.670016e-07  7.089984e-07  7.090021e-07  7.079980e-07  
6.660011e-07   \n",
+      "\n",
+      "                   10            11            12            13         
   14  \\\n",
+      "2        7.090021e-07  8.749994e-07  7.499984e-07  7.089984e-07  
7.080016e-07   \n",
+      "3        7.080016e-07  7.079980e-07  7.919989e-07  7.920025e-07  
7.500021e-07   \n",
+      "4        7.080016e-07  8.750030e-07  7.080016e-07  2.790999e-06  
7.079980e-07   \n",
+      "5        7.500021e-07  7.090021e-07  7.080016e-07  7.499984e-07  
7.079980e-07   \n",
+      "6        7.089984e-07  7.080016e-07  7.079980e-07  7.080016e-07  
7.080016e-07   \n",
+      "...               ...           ...           ...           ...         
  ...   \n",
+      "1763487  7.499984e-07  7.079980e-07  7.090021e-07  7.079980e-07  
7.089984e-07   \n",
+      "2097152  7.079980e-07  7.090021e-07  7.079980e-07  7.089984e-07  
7.090021e-07   \n",
+      "2493948  7.079980e-07  7.089984e-07  6.669979e-07  7.089984e-07  
7.080016e-07   \n",
+      "2965820  7.089984e-07  7.499984e-07  6.669979e-07  6.669979e-07  
7.080016e-07   \n",
+      "3526975  7.080016e-07  6.669979e-07  6.670016e-07  7.080016e-07  
7.499984e-07   \n",
+      "\n",
+      "                   15  \n",
+      "2        7.500021e-07  \n",
+      "3        7.089984e-07  \n",
+      "4        2.834000e-06  \n",
+      "5        7.499984e-07  \n",
+      "6        7.499984e-07  \n",
+      "...               ...  \n",
+      "1763487  7.079980e-07  \n",
+      "2097152  7.500021e-07  \n",
+      "2493948  6.670016e-07  \n",
+      "2965820  7.090021e-07  \n",
+      "3526975  6.670016e-07  \n",
+      "\n",
+      "[81 rows x 16 columns]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      
"/Users/charlied/dev/python-sketching-lib-comparison/datasketches-python/jupyter/comparison-to-datasketch/utils.py:27:
 RuntimeWarning: overflow encountered in scalar add\n",
+      "  num += np.uint64(golden_ratio)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "#################### PARTIAL RESULTS FOR 32 TRIALS: DATASKETCHES 
####################\n",
+      "                   0             1             2             3          
   4   \\\n",
+      "2        1.166001e-06  7.089984e-07  9.159994e-07  7.499984e-07  
8.339994e-07   \n",
+      "3        1.162500e-05  7.499984e-07  1.375000e-06  7.909985e-07  
1.249999e-06   \n",
+      "4        1.000000e-06  3.583002e-06  7.919989e-07  7.080016e-07  
7.909985e-07   \n",
+      "5        8.329989e-07  7.499984e-07  7.919989e-07  7.499984e-07  
7.500021e-07   \n",
+      "6        8.749994e-07  7.499984e-07  7.499984e-07  7.089984e-07  
7.500021e-07   \n",
+      "...               ...           ...           ...           ...         
  ...   \n",
+      "1763487  7.079980e-07  7.079980e-07  7.499984e-07  7.080016e-07  
7.089984e-07   \n",
+      "2097152  7.079980e-07  7.090021e-07  7.080016e-07  7.079980e-07  
7.079980e-07   \n",
+      "2493948  7.499984e-07  7.500021e-07  7.089984e-07  7.089984e-07  
7.079980e-07   \n",
+      "2965820  7.089984e-07  7.089984e-07  6.669979e-07  7.090021e-07  
7.500021e-07   \n",
+      "3526975  6.669979e-07  7.079980e-07  7.079980e-07  7.080016e-07  
6.669979e-07   \n",
+      "\n",
+      "                   5             6             7             8          
   9   \\\n",
+      "2        1.375000e-06  7.080016e-07  7.499984e-07  7.919989e-07  
7.500021e-07   \n",
+      "3        7.500021e-07  7.910021e-07  7.089984e-07  1.332999e-06  
7.499984e-07   \n",
+      "4        7.499984e-07  7.089984e-07  7.080016e-07  8.330026e-07  
2.666999e-06   \n",
+      "5        7.499984e-07  6.670016e-07  7.090021e-07  7.909985e-07  
7.079980e-07   \n",
+      "6        7.089984e-07  7.080016e-07  7.089984e-07  7.909985e-07  
7.499984e-07   \n",
+      "...               ...           ...           ...           ...         
  ...   \n",
+      "1763487  7.079980e-07  7.499984e-07  7.090021e-07  7.080016e-07  
7.090021e-07   \n",
+      "2097152  6.659975e-07  7.080016e-07  7.080016e-07  7.500021e-07  
7.090021e-07   \n",
+      "2493948  7.079980e-07  7.090021e-07  6.660011e-07  7.500021e-07  
7.499984e-07   \n",
+      "2965820  7.079980e-07  6.670016e-07  6.670016e-07  7.080016e-07  
7.089984e-07   \n",
+      "3526975  6.670016e-07  7.089984e-07  7.090021e-07  7.079980e-07  
6.660011e-07   \n",
+      "\n",
+      "         ...            22            23            24            25  
\\\n",
+      "2        ...  7.500021e-07  7.919989e-07  7.080016e-07  7.079980e-07   
\n",
+      "3        ...  7.919989e-07  7.920025e-07  7.499984e-07  7.909985e-07   
\n",
+      "4        ...  7.500021e-07  7.500021e-07  7.080016e-07  7.089984e-07   
\n",
+      "5        ...  7.499984e-07  7.080016e-07  7.089984e-07  7.500021e-07   
\n",
+      "6        ...  7.499984e-07  7.089984e-07  7.079980e-07  7.080016e-07   
\n",
+      "...      ...           ...           ...           ...           ...   
\n",
+      "1763487  ...  3.292000e-06  7.499984e-07  7.080016e-07  7.089984e-07   
\n",
+      "2097152  ...  7.080016e-07  7.500021e-07  7.080016e-07  7.079980e-07   
\n",
+      "2493948  ...  7.089984e-07  7.089984e-07  6.670016e-07  7.090021e-07   
\n",
+      "2965820  ...  6.670016e-07  6.669979e-07  7.089984e-07  7.500021e-07   
\n",
+      "3526975  ...  7.080016e-07  7.080016e-07  6.670016e-07  7.500021e-07   
\n",
+      "\n",
+      "                   26            27            28            29         
   30  \\\n",
+      "2        7.499984e-07  7.500021e-07  7.499984e-07  7.089984e-07  
7.499984e-07   \n",
+      "3        7.500021e-07  7.499984e-07  7.920025e-07  7.079980e-07  
7.920025e-07   \n",
+      "4        7.079980e-07  7.080016e-07  7.500021e-07  7.500021e-07  
2.624998e-06   \n",
+      "5        7.499984e-07  7.500021e-07  7.500021e-07  8.329989e-07  
7.079980e-07   \n",
+      "6        7.079980e-07  7.089984e-07  7.500021e-07  7.499984e-07  
7.080016e-07   \n",
+      "...               ...           ...           ...           ...         
  ...   \n",
+      "1763487  7.919989e-07  7.079980e-07  7.090021e-07  7.079980e-07  
7.089984e-07   \n",
+      "2097152  7.080016e-07  7.079980e-07  7.499984e-07  7.089984e-07  
6.669979e-07   \n",
+      "2493948  7.080016e-07  7.079980e-07  7.079980e-07  7.499984e-07  
6.670016e-07   \n",
+      "2965820  7.079980e-07  6.669979e-07  7.079980e-07  6.670016e-07  
7.079980e-07   \n",
+      "3526975  7.080016e-07  7.090021e-07  7.080016e-07  7.089984e-07  
7.090021e-07   \n",
+      "\n",
+      "                   31  \n",
+      "2        7.080016e-07  \n",
+      "3        7.500021e-07  \n",
+      "4        7.080016e-07  \n",
+      "5        7.499984e-07  \n",
+      "6        7.910021e-07  \n",
+      "...               ...  \n",
+      "1763487  6.660011e-07  \n",
+      "2097152  7.500021e-07  \n",
+      "2493948  7.090021e-07  \n",
+      "2965820  6.669979e-07  \n",
+      "3526975  7.079980e-07  \n",
+      "\n",
+      "[81 rows x 32 columns]\n",
+      "CPU times: user 8min 58s, sys: 1.78 s, total: 9min\n",
+      "Wall time: 9min 1s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "update_experiment.run()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "54db1c44-bc9a-4b02-93da-c96190669bf3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "asf8 = 
pd.read_csv(f\"hll_update_time_profile_/DataSketches_hll{update_experiment.file_extension}trials_32_8_bit.csv\",
 index_col=0)\n",
+    "asf6 = 
pd.read_csv(f\"hll_update_time_profile_/DataSketches_hll{update_experiment.file_extension}trials_32_6_bit.csv\",
 index_col=0)\n",
+    "asf4 = 
pd.read_csv(f\"hll_update_time_profile_/DataSketches_hll{update_experiment.file_extension}trials_32_4_bit.csv\",
 index_col=0)\n",
+    "dsk = 
pd.read_csv(f\"hll_update_time_profile_/datasketch_hll{update_experiment.file_extension}trials_32.csv\",
 index_col=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "39a1070c-5b00-43a4-85d2-76711aa99084",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Plotting parameters\n",
+    "method_plot_params = {\n",
+    "    \"asf8\" : {\"color\": \"C0\", \"marker\": '.'},\n",
+    "    \"asf6\" : {\"color\": \"C2\", \"marker\": '+'},\n",
+    "    \"asf4\" : {\"color\": \"C3\", \"marker\": 'd'},\n",
+    "    \"datasketch\" : {\"color\": \"C1\", \"marker\": \"^\"}\n",
+    "}\n",
+    "asf8_color = method_plot_params[\"asf8\"][\"color\"]\n",
+    "asf6_color = method_plot_params[\"asf6\"][\"color\"]\n",
+    "asf4_color = method_plot_params[\"asf4\"][\"color\"]\n",
+    "ds_color = method_plot_params[\"datasketch\"][\"color\"]\n",
+    "q90_ls = \"--\"\n",
+    "\n",
+    "params = {'legend.fontsize': 'x-large',\n",
+    "     'axes.labelsize': 'x-large',\n",
+    "     'axes.titlesize':'x-large',\n",
+    "     'xtick.labelsize':'x-large',\n",
+    "     'ytick.labelsize':'x-large',\n",
+    "      \"lines.linewidth\": 2.5}\n",
+    "plt.rcParams.update(params)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6c340f56-6301-4fcf-826e-c3962c93683d",
+   "metadata": {},
+   "source": [
+    "We plot on the $y$-axis the median update time (in microseconds) and 
$x$-axis the input cardinality on a $\\log_{10}$ scale.\n",
+    "As expected, for both libraries, we see that the update time for a single 
item is essentially constant as $n$ increases, barring a small number of minor 
blips."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "31f1125d-e5ea-41af-abf3-d99a8ffb87b7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(6e-07, 1.6e-06)"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAA/gAAAHHCAYAAAALaPPGAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAC450lEQVR4nOzdd3hUVfoH8O+dkt4TSkhIoYTeBBHpRUFRQMQOQhR7QcTdtS1Ll9WfyoJlXUBpoqsLGoqIdESlQ2hSUwgJJZDeM+X+/rjMZO6UZFqSSfh+nicPmTt37pwhZ2bue8973iOIoiiCiIiIiIiIiBo0RX03gIiIiIiIiIhcxwCfiIiIiIiIqBFggE9ERERERETUCDDAJyIiIiIiImoEGOATERERERERNQIM8ImIiIiIiIgaAQb4RERERERERI0AA3wiIiIiIiKiRkBV3w0gIgIAjUYDnU5X380gIiIiFyiV
 [...]
+      "text/plain": [
+       "<Figure size 1200x400 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "fig, ax = plt.subplots(figsize=(12,4))\n",
+    "\n",
+    "methods = [\"ASF8\", \"ASF6\", \"ASF4\", \"datasketch\"]\n",
+    "\n",
+    "for i, (method, colour, df) in enumerate(zip(methods, [asf8_color, 
asf6_color, asf4_color, ds_color], [asf8, asf6, asf4, dsk])):\n",
+    "    xn = df.index \n",
+    "    median = df.median(axis=1)\n",
+    "    ax.plot(xn, median,\n",
+    "           color=colour, label=method+\": median\", alpha=0.5)\n",
+    "\n",
+    "ax.set_xscale('log', base=10)\n",
+    "ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.25),\n",
+    "          ncol=2, fancybox=True)\n",
+    "ax.grid()\n",
+    "ax.set_ylabel(\"Median Update time\")\n",
+    "ax.set_xlabel(r\"Input cardinality $n$\")\n",
+    "ax.set_ylim(0.6E-6, 1.6E-6)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5d969c60-4df4-4ee6-a570-604a5a68e655",
+   "metadata": {},
+   "source": [
+    "We evaluate the factor speedup per update.  This is defined as the ratio 
of the total time per update required in using the datasketch library compred 
to the ASF implementation.  We see that the datasketch library generally takes 
twice as much time per update compared to the ASF implementation.  Since there 
is essentially no difference between the $4, 6, 8$ bit versions of ASF 
HyperLogLog, we only plot the $8$ bit comparison below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "6d68deef-26a8-411c-b0b1-e2182e41b967",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Text(0.5, 0, 'Input cardinality $n$')"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": 
"iVBORw0KGgoAAAANSUhEUgAABAkAAAGECAYAAABUEXkgAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAAB/lUlEQVR4nO3deVxU5f4H8M9sDOuAuKIgiIqiZuaGoqhoWmmZpamlpmWaqeVyb/faKlpdr7f6WVaWZqnZZu6WKyluqLjvIgrijgvKDsMs5/cHMM4wCwMzAwf8vF8vXjDnnDnzHOY7MM93nuf7SARBEEBEREREREREDz1pVTeAiIiIiIiIiMSBSQIiIiIiIiIiAsAkAREREREREREVY5KAiIiIiIiIiAAwSUBERERERERExZgkICIiIiIiIiIATBIQERERERERUTEmCYiIiIiIiIgIACCv6gY8bPR6PW7cuAEfHx9I
 [...]
+      "text/plain": [
+       "<Figure size 1200x400 with 1 Axes>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "fig, ax = plt.subplots(figsize=(12,4))\n",
+    "\n",
+    "methods = [\"ASF8\", \"ASF6\", \"ASF4\", \"datasketch\"]\n",
+    "\n",
+    "\n",
+    "xn = asf8.index\n",
+    "xspeedup = (dsk / asf8).median(axis=1)\n",
+    "ax.plot(xn, xspeedup)\n",
+    "\n",
+    "ax.set_xscale('log', base=10)\n",
+    "ax.grid()\n",
+    "ax.set_ylabel(r\"$\\times$ Speedup\")\n",
+    "ax.set_xlabel(r\"Input cardinality $n$\")\n",
+    "#ax.set_ylim(0.25E-6, 1.5E-6)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datasketches-python) 01/01: Added update time comparison

Reply via email to