This is an automated email from the ASF dual-hosted git repository.
charlie pushed a commit to branch lib-comparison
in repository https://gitbox.apache.org/repos/asf/datasketches-python.git
The following commit(s) were added to refs/heads/lib-comparison by this push:
new a54333d Added hll space comparison
a54333d is described below
commit a54333d86cde17910d3a79ca6d53c83a196a3144
Author: Charlie Dickens <[email protected]>
AuthorDate: Thu Nov 30 15:08:50 2023 +0000
Added hll space comparison
---
.../space-comparison.ipynb | 560 +++++++++++++++++++++
1 file changed, 560 insertions(+)
diff --git a/jupyter/comparison-to-datasketch/space-comparison.ipynb
b/jupyter/comparison-to-datasketch/space-comparison.ipynb
new file mode 100644
index 0000000..7ee2994
--- /dev/null
+++ b/jupyter/comparison-to-datasketch/space-comparison.ipynb
@@ -0,0 +1,560 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "472743a8-4868-4329-af35-50ab12b95662",
+ "metadata": {},
+ "source": [
+ "# Space Comparison \n",
+ "\n",
+ "We compare the space usage between the $4, 6, 8$ bit versions of the
Apache DataSketches (ASF) HyperLogLog implementation alongside the `datasketch`
HyperLogLogPlusPlus implementation. We show that the `datasketch` version has
approximately the same size as the ASF $8$ bit implementation when the latter
is in full estimation mode. However, smaller sketches can be made without
compromising on the accuracy if $6$ or $4$ bits per bucket are used which use
$75\\%$ and $50\\%$ of the s [...]
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "71570575-d16b-4b9b-a066-f921b58eb179",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from datetime import datetime\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "from utils import distinct_number_sequence\n",
+ "import datasketches as ds\n",
+ "import datasketch as d\n",
+ "import mmh3\n",
+ "import matplotlib.pyplot as plt\n",
+ "from timeit import default_timer\n",
+ "%matplotlib inline"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "82fac9dc-d5f2-4d19-9f18-8edc96d7c045",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class SpaceProfile:\n",
+ " \"\"\"Generates an experiment evaluating the update time for
different cardinality inputs\"\"\"\n",
+ " def __init__(self, sketch_lgk:int, lg_trials:int, max_lgN:int):\n",
+ " self.sketch_lgk = sketch_lgk\n",
+ " self.num_trials = 2**lg_trials\n",
+ " self.max_lgN = max_lgN\n",
+ " self.max_num_distincts = np.uint64(2 ** self.max_lgN)\n",
+ " self.directory_name = \"hll_space_profile_\"\n",
+ " if not os.path.exists(self.directory_name):\n",
+ " os.mkdir(self.directory_name)\n",
+ " self.file_extension = \"_\" + datetime.today().strftime('%H%M') +
f\"lgK_{self.sketch_lgk}_lgT_{lg_trials}\"\n",
+ "\n",
+ " # Need to remove repeated items for the program logic in
self.run()\n",
+ " self.plot_points = self._generate_plot_points()\n",
+ " self.plot_points.extend(self._generate_plot_points())\n",
+ " self.plot_points = list(set(self.plot_points))\n",
+ " self.plot_points.sort()\n",
+ " print(f\"Testing {len(self.plot_points)} points with
{self.num_trials} trials for average update times:\")\n",
+ " print(self.plot_points)\n",
+ "\n",
+ " # Initialise the data structures for results; one for each method
tested 8, 6, datasketch\n",
+ " # np.ndarrays\n",
+ " self.DataSketches_results_arr = np.zeros((len(self.plot_points),
self.num_trials), dtype=float)\n",
+ " self.DataSketches_results_arr_6 =
np.zeros_like(self.DataSketches_results_arr)\n",
+ " self.DataSketches_results_arr_4 =
np.zeros_like(self.DataSketches_results_arr)\n",
+ " self.datasketch_results_arr =
np.zeros_like(self.DataSketches_results_arr)\n",
+ "\n",
+ " # pd.DataFrames\n",
+ " self.DataSketches_results_df =
pd.DataFrame(index=self.plot_points, columns=None)\n",
+ " self.DataSketches_results_df_6 =
pd.DataFrame(index=self.plot_points, columns=None)\n",
+ " self.DataSketches_results_df_4 =
pd.DataFrame(index=self.plot_points, columns=None)\n",
+ " self.datasketch_results_df = pd.DataFrame(index=self.plot_points,
columns=None)\n",
+ " \n",
+ " def _generate_plot_points(self) -> list:\n",
+ " \"\"\"\n",
+ " Generates the standard sequence defining the input cardinalites
for the experiment\n",
+ " This is just two points at each power of 2\n",
+ " \"\"\"\n",
+ " all_plot_points = []\n",
+ " for lgk in range(1, self.max_lgN+1):\n",
+ " points = np.unique(np.logspace(start=lgk, stop=lgk+1, num=4,
endpoint=False, base=2, dtype=np.uint64))\n",
+ " all_plot_points.extend(points)\n",
+ " all_plot_points.sort()\n",
+ " return all_plot_points\n",
+ "\n",
+ " def _is_power_of_two(self, a:np.uint64) -> bool:\n",
+ " \"\"\"Bitwise operations to check value a is a power of
two\"\"\"\n",
+ " return (a & (a-1) == 0) and a != 0\n",
+ "\n",
+ " def run(self) -> None:\n",
+ " \"\"\"Runs the experiment and writes the files every power of two
trials.\"\"\"\n",
+ " seq_start = np.uint64(2345234)\n",
+ " distinct_number = np.uint64(3462)\n",
+ " previous_log_trial_index = 0\n",
+ " ds_all_results = np.zeros((self.num_trials,
len(self.plot_points)))\n",
+ " ds_all_results_6 = np.zeros_like(ds_all_results)\n",
+ " ds_all_results_4 = np.zeros_like(ds_all_results)\n",
+ " d_all_results = np.zeros_like(ds_all_results)\n",
+ "\n",
+ " for trial in range(1, self.num_trials+1):\n",
+ "\n",
+ " # Initialise the sketches\n",
+ " hll = ds.hll_sketch(self.sketch_lgk, ds.HLL_8)\n",
+ " hll6 = ds.hll_sketch(self.sketch_lgk, ds.HLL_6)\n",
+ " hll4 = ds.hll_sketch(self.sketch_lgk, ds.HLL_4)\n",
+ " h = d.HyperLogLogPlusPlus(p=self.sketch_lgk, hashfunc=lambda
x: mmh3.hash64(x, signed=False)[0])\n",
+ " plot_point_index = 0 # Return to the start of the plot
points list to generate the data\n",
+ " plot_point_value = self.plot_points[plot_point_index]\n",
+ " total_updates = 0\n",
+ " seq_start += distinct_number # Start a new input sequence\n",
+ "\n",
+ " # Temporary result data structure\n",
+ " ds_results = np.zeros((len(self.plot_points),))\n",
+ " ds_results_6 = np.zeros_like(ds_results)\n",
+ " ds_results_4 = np.zeros_like(ds_results)\n",
+ " d_results = np.zeros_like(ds_results)\n",
+ "\n",
+ "\n",
+ " for new_number in distinct_number_sequence(seq_start):\n",
+ " d_input = new_number.tobytes()\n",
+ " \n",
+ " hll.update(d_input)\n",
+ " hll6.update(d_input)\n",
+ " hll4.update(d_input)\n",
+ " h.update(d_input)\n",
+ " total_updates += 1\n",
+ "\n",
+ " if total_updates == plot_point_value:\n",
+ " ds_results[plot_point_index] =
hll.get_compact_serialization_bytes()\n",
+ " ds_results_6[plot_point_index] =
hll6.get_compact_serialization_bytes()\n",
+ " ds_results_4[plot_point_index] =
hll4.get_compact_serialization_bytes()\n",
+ " d_results[plot_point_index] = h.bytesize()\n",
+ " plot_point_index += 1\n",
+ "\n",
+ " if plot_point_index < len(self.plot_points):\n",
+ " plot_point_value =
self.plot_points[plot_point_index]\n",
+ " else:\n",
+ " break\n",
+ "\n",
+ " # After the break statement, control returns here. Now need
to decide whether to write or continue.\n",
+ " # subtract 1 as we use 1-based indexing for the trial
count.\n",
+ " ds_all_results[trial-1, :] = ds_results \n",
+ " ds_all_results_6[trial-1, :] = ds_results_6 \n",
+ " ds_all_results_4[trial-1, :] = ds_results_4\n",
+ " d_all_results[trial - 1, :] = d_results \n",
+ " if self._is_power_of_two(trial) and trial > 1:\n",
+ " # write the array only a logarithmic number of times\n",
+ " temporary_ds_results = ds_all_results[0:trial, : ]\n",
+ " temporary_ds_results_6 = ds_all_results_6[0:trial, : ]\n",
+ " temporary_ds_results_4 = ds_all_results_4[0:trial, : ]\n",
+ " temporary_d_results = d_all_results[0:trial, :]\n",
+ " print(f\"#################### PARTIAL RESULTS FOR {trial}
TRIALS: DATASKETCHES ####################\")\n",
+ " previous_log_trial_index = trial\n",
+ "\n",
+ " # Write 8 bit results\n",
+ " self.DataSketches_results_df =
pd.DataFrame(temporary_ds_results.T, \n",
+ "
index=self.DataSketches_results_df.index, \n",
+ "
columns=np.arange(trial).tolist())\n",
+ " self.DataSketches_results_df.to_csv(\n",
+ " self.directory_name + \"/DataSketches_hll\" +
self.file_extension + f\"trials_{trial}_8_bit.csv\",\n",
+ " index_label=\"n\")\n",
+ "\n",
+ " # Write 6 bit results\n",
+ " self.DataSketches_results_df_6 =
pd.DataFrame(temporary_ds_results_6.T, \n",
+ "
index=self.DataSketches_results_df_6.index, \n",
+ "
columns=np.arange(trial).tolist())\n",
+ " self.DataSketches_results_df_6.to_csv(\n",
+ " self.directory_name + \"/DataSketches_hll\" +
self.file_extension + f\"trials_{trial}_6_bit.csv\",\n",
+ " index_label=\"n\")\n",
+ "\n",
+ " # Write 4 bit results\n",
+ " self.DataSketches_results_df_4 =
pd.DataFrame(temporary_ds_results_4.T, \n",
+ "
index=self.DataSketches_results_df_4.index, \n",
+ "
columns=np.arange(trial).tolist())\n",
+ " self.DataSketches_results_df_4.to_csv(\n",
+ " self.directory_name + \"/DataSketches_hll\" +
self.file_extension + f\"trials_{trial}_4_bit.csv\",\n",
+ " index_label=\"n\")\n",
+ "\n",
+ " # Write datasketch results\n",
+ " self.datasketch_results_df =
pd.DataFrame(temporary_d_results.T,\n",
+ "
index=self.datasketch_results_df.index,\n",
+ "
columns=np.arange(trial).tolist())\n",
+ " self.datasketch_results_df.to_csv(\n",
+ " self.directory_name + \"/datasketch_hll\" +
self.file_extension + f\"trials_{trial}.csv\",\n",
+ " index_label=\"n\"\n",
+ " )\n",
+ " print(self.DataSketches_results_df)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "01003378-cb60-45d0-9b19-a74b028dd4a1",
+ "metadata": {},
+ "source": [
+ "The experiment updates each of the four sketches with a number of
distinct values determined by the input parameters `MAX_LG_N`. A single pass
is taken for each trial and all four methods are updated in that pass. The
number of bytes required to write the sketch to memory is evaluated and written
to a results file.\n",
+ "\n",
+ "There is essentially no variation in the sketch sizes over these trials
so only a small number of trials are performed. We set the default
experimental parameters below. \n",
+ "\n",
+ "```SKETCH_LGK = 12, LG_TRIALS = 2, MAX_LG_N = 21``` "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "45e22a9a-4b30-4a2b-9a49-0336e4d27bd8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "SKETCH_LGK = 12\n",
+ "LG_TRIALS = 3\n",
+ "MAX_LG_N = 21"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "b748aead-1563-485f-adb5-378b1655bf38",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Testing 81 points with 8 trials for average update times:\n",
+ "[2, 3, 4, 5, 6, 8, 9, 11, 13, 16, 19, 22, 26, 32, 38, 45, 53, 64, 76,
90, 107, 128, 152, 181, 215, 256, 304, 362, 430, 512, 608, 724, 861, 1024,
1217, 1448, 1722, 2048, 2435, 2896, 3444, 4096, 4870, 5792, 6888, 8192, 9741,
11585, 13777, 16384, 19483, 23170, 27554, 32768, 38967, 46340, 55108, 65536,
77935, 92681, 110217, 131072, 155871, 185363, 220435, 262144, 311743, 370727,
440871, 524288, 623487, 741455, 881743, 1048576, 1246974, 1482910, 1763487,
2097152, 2493948, 2965820, 3526975]\n"
+ ]
+ }
+ ],
+ "source": [
+ "space_experiment = SpaceProfile(SKETCH_LGK, LG_TRIALS, MAX_LG_N)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "cc7b9b3b-7976-4fb9-adc7-d7691134df93",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "#################### PARTIAL RESULTS FOR 2 TRIALS: DATASKETCHES
####################\n",
+ " 0 1\n",
+ "2 16.0 16.0\n",
+ "3 20.0 20.0\n",
+ "4 24.0 24.0\n",
+ "5 28.0 28.0\n",
+ "6 32.0 32.0\n",
+ "... ... ...\n",
+ "1763487 4136.0 4136.0\n",
+ "2097152 4136.0 4136.0\n",
+ "2493948 4136.0 4136.0\n",
+ "2965820 4136.0 4136.0\n",
+ "3526975 4136.0 4136.0\n",
+ "\n",
+ "[81 rows x 2 columns]\n",
+ "#################### PARTIAL RESULTS FOR 4 TRIALS: DATASKETCHES
####################\n",
+ " 0 1 2 3\n",
+ "2 16.0 16.0 16.0 16.0\n",
+ "3 20.0 20.0 20.0 20.0\n",
+ "4 24.0 24.0 24.0 24.0\n",
+ "5 28.0 28.0 28.0 28.0\n",
+ "6 32.0 32.0 32.0 32.0\n",
+ "... ... ... ... ...\n",
+ "1763487 4136.0 4136.0 4136.0 4136.0\n",
+ "2097152 4136.0 4136.0 4136.0 4136.0\n",
+ "2493948 4136.0 4136.0 4136.0 4136.0\n",
+ "2965820 4136.0 4136.0 4136.0 4136.0\n",
+ "3526975 4136.0 4136.0 4136.0 4136.0\n",
+ "\n",
+ "[81 rows x 4 columns]\n",
+ "#################### PARTIAL RESULTS FOR 8 TRIALS: DATASKETCHES
####################\n",
+ " 0 1 2 3 4 5 6
7\n",
+ "2 16.0 16.0 16.0 16.0 16.0 16.0 16.0
16.0\n",
+ "3 20.0 20.0 20.0 20.0 20.0 20.0 20.0
20.0\n",
+ "4 24.0 24.0 24.0 24.0 24.0 24.0 24.0
24.0\n",
+ "5 28.0 28.0 28.0 28.0 28.0 28.0 28.0
28.0\n",
+ "6 32.0 32.0 32.0 32.0 32.0 32.0 32.0
32.0\n",
+ "... ... ... ... ... ... ... ...
...\n",
+ "1763487 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0
4136.0\n",
+ "2097152 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0
4136.0\n",
+ "2493948 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0
4136.0\n",
+ "2965820 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0
4136.0\n",
+ "3526975 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0
4136.0\n",
+ "\n",
+ "[81 rows x 8 columns]\n",
+ "CPU times: user 2min 9s, sys: 229 ms, total: 2min 9s\n",
+ "Wall time: 2min 9s\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "space_experiment.run()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "8afd8986-9641-4cc9-ac4d-5ead4c359367",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "asf8 =
pd.read_csv(f\"hll_space_profile_/DataSketches_hll{space_experiment.file_extension}trials_{1<<LG_TRIALS}_8_bit.csv\",
index_col=0)\n",
+ "asf6 =
pd.read_csv(f\"hll_space_profile_/DataSketches_hll{space_experiment.file_extension}trials_{1<<LG_TRIALS}_6_bit.csv\",
index_col=0)\n",
+ "asf4 =
pd.read_csv(f\"hll_space_profile_/DataSketches_hll{space_experiment.file_extension}trials_{1<<LG_TRIALS}_4_bit.csv\",
index_col=0)\n",
+ "dsk =
pd.read_csv(f\"hll_space_profile_/datasketch_hll{space_experiment.file_extension}trials_{1<<LG_TRIALS}.csv\",
index_col=0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "85c00f74-7e5c-4aa5-81ef-6c39d61a49e1",
+ "metadata": {},
+ "source": [
+ "Next, we will plot the median number of bytes required to write the
sketch to memory for each method. There is no variation for the sizes of ASF8
and ASF6; similarly, for ASF4, although there is slight variation in the size
of the sketches in estimation mode. The `datasketch` implementation has no
variation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "702c160f-f5db-4c81-96ba-03935a48c9da",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "( 0 1 2 3 4 5 6 7\n",
+ " n \n",
+ " 2 16.0 16.0 16.0 16.0 16.0 16.0 16.0 16.0\n",
+ " 3 20.0 20.0 20.0 20.0 20.0 20.0 20.0 20.0\n",
+ " 4 24.0 24.0 24.0 24.0 24.0 24.0 24.0 24.0\n",
+ " 5 28.0 28.0 28.0 28.0 28.0 28.0 28.0 28.0\n",
+ " 6 32.0 32.0 32.0 32.0 32.0 32.0 32.0 32.0,\n",
+ " 0 1 2 3 4 5 6
7\n",
+ " n
\n",
+ " 1763487 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0
4136.0\n",
+ " 2097152 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0
4136.0\n",
+ " 2493948 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0
4136.0\n",
+ " 2965820 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0
4136.0\n",
+ " 3526975 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0 4136.0
4136.0)"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "asf8.head(), asf8.tail()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "4764f965-4e40-4ff9-a99f-001f8fe7a7a9",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "( 0 1 2 3 4 5 6 7\n",
+ " n \n",
+ " 2 16.0 16.0 16.0 16.0 16.0 16.0 16.0 16.0\n",
+ " 3 20.0 20.0 20.0 20.0 20.0 20.0 20.0 20.0\n",
+ " 4 24.0 24.0 24.0 24.0 24.0 24.0 24.0 24.0\n",
+ " 5 28.0 28.0 28.0 28.0 28.0 28.0 28.0 28.0\n",
+ " 6 32.0 32.0 32.0 32.0 32.0 32.0 32.0 32.0,\n",
+ " 0 1 2 3 4 5 6
7\n",
+ " n
\n",
+ " 1763487 3113.0 3113.0 3113.0 3113.0 3113.0 3113.0 3113.0
3113.0\n",
+ " 2097152 3113.0 3113.0 3113.0 3113.0 3113.0 3113.0 3113.0
3113.0\n",
+ " 2493948 3113.0 3113.0 3113.0 3113.0 3113.0 3113.0 3113.0
3113.0\n",
+ " 2965820 3113.0 3113.0 3113.0 3113.0 3113.0 3113.0 3113.0
3113.0\n",
+ " 3526975 3113.0 3113.0 3113.0 3113.0 3113.0 3113.0 3113.0
3113.0)"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "asf6.head(), asf6.tail()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "d8513334-4c91-450d-ad97-12418bc8c5d6",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "( 0 1 2 3 4 5 6 7\n",
+ " n \n",
+ " 2 16.0 16.0 16.0 16.0 16.0 16.0 16.0 16.0\n",
+ " 3 20.0 20.0 20.0 20.0 20.0 20.0 20.0 20.0\n",
+ " 4 24.0 24.0 24.0 24.0 24.0 24.0 24.0 24.0\n",
+ " 5 28.0 28.0 28.0 28.0 28.0 28.0 28.0 28.0\n",
+ " 6 32.0 32.0 32.0 32.0 32.0 32.0 32.0 32.0,\n",
+ " 0 1 2 3 4 5 6
7\n",
+ " n
\n",
+ " 1763487 2092.0 2088.0 2096.0 2096.0 2092.0 2100.0 2092.0
2096.0\n",
+ " 2097152 2092.0 2092.0 2104.0 2096.0 2096.0 2100.0 2092.0
2088.0\n",
+ " 2493948 2092.0 2092.0 2104.0 2088.0 2100.0 2104.0 2096.0
2088.0\n",
+ " 2965820 2092.0 2104.0 2088.0 2088.0 2092.0 2100.0 2096.0
2088.0\n",
+ " 3526975 2092.0 2104.0 2088.0 2088.0 2092.0 2100.0 2096.0
2088.0)"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "asf4.head(), asf4.tail()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "a0b4bf36-8fec-4007-8a67-dda92ba49234",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "( 0 1 2 3 4 5 6 7\n",
+ " n \n",
+ " 2 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0\n",
+ " 3 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0\n",
+ " 4 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0\n",
+ " 5 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0\n",
+ " 6 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0,\n",
+ " 0 1 2 3 4 5 6
7\n",
+ " n
\n",
+ " 1763487 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0
4097.0\n",
+ " 2097152 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0
4097.0\n",
+ " 2493948 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0
4097.0\n",
+ " 2965820 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0
4097.0\n",
+ " 3526975 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0 4097.0
4097.0)"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dsk.head(), dsk.tail()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "0bd8f65e-ff85-4857-a79a-87687128adff",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Plotting parameters\n",
+ "method_plot_params = {\n",
+ " \"asf8\" : {\"color\": \"C0\", \"marker\": '.'},\n",
+ " \"asf6\" : {\"color\": \"C2\", \"marker\": '+'},\n",
+ " \"asf4\" : {\"color\": \"C3\", \"marker\": 'd'},\n",
+ " \"datasketch\" : {\"color\": \"C1\", \"marker\": \"^\"}\n",
+ "}\n",
+ "asf8_color = method_plot_params[\"asf8\"][\"color\"]\n",
+ "asf6_color = method_plot_params[\"asf6\"][\"color\"]\n",
+ "asf4_color = method_plot_params[\"asf4\"][\"color\"]\n",
+ "ds_color = method_plot_params[\"datasketch\"][\"color\"]\n",
+ "q90_ls = \"--\"\n",
+ "\n",
+ "params = {'legend.fontsize': 'x-large',\n",
+ " 'axes.labelsize': 'x-large',\n",
+ " 'axes.titlesize':'x-large',\n",
+ " 'xtick.labelsize':'x-large',\n",
+ " 'ytick.labelsize':'x-large',\n",
+ " \"lines.linewidth\": 2.5}\n",
+ "plt.rcParams.update(params)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "cb2980aa-c47c-40c1-a69f-ff8271cbcb15",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 0, 'Input cardinality $n$')"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png":
"iVBORw0KGgoAAAANSUhEUgAAA+UAAAHHCAYAAADZOPmeAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/SrBM8AAAACXBIWXMAAA9hAAAPYQGoP6dpAACCNUlEQVR4nO3dd3xTdfv/8XeSpovSxSx7D9kb2QiyVFBBURFBcTAERL1/rlspKvDV763cgHor8hVwcauALJE9BAWUpYIgWzZIoQvaNE3O74/S0NC0tKFt2vJ6Ph59QM75nJPrtFfSXP2MYzIMwxAAAAAAAChwZl8HAAAAAADAzYqiHAAAAAAAH6EoBwAAAADARyjKAQAAAADwEYpyAAAAAAB8hKIcAAAAAAAfoSgHAAAAAMBHKMoBAAAAAPARP18HAKDostvtcjgcvg4DAADcAIvFIqvV6uswgJsWRTmAXIuPj9f5
[...]
+ "text/plain": [
+ "<Figure size 1200x400 with 1 Axes>"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "fig, ax = plt.subplots(figsize=(12,4))\n",
+ "\n",
+ "methods = [\"ASF8\", \"ASF6\", \"ASF4\", \"datasketch\"]\n",
+ "\n",
+ "for i, (method, colour, df) in enumerate(zip(methods, [asf8_color,
asf6_color, asf4_color, ds_color], [asf8, asf6, asf4, dsk])):\n",
+ " xn = df.index \n",
+ " median = df.median(axis=1)\n",
+ " ax.plot(xn, median / 1024,\n",
+ " color=colour, label=method+\": median\", alpha=0.5)\n",
+ "\n",
+ "ax.set_xscale('log', base=10)\n",
+ "ax.legend(loc='upper center', bbox_to_anchor=(0.5, 1.25),\n",
+ " ncol=2, fancybox=True)\n",
+ "ax.grid()\n",
+ "ax.set_ylabel(\"Space (kilobytes)\")\n",
+ "ax.set_xlabel(r\"Input cardinality $n$\")\n",
+ "# ax.set_ylim(0.6E-6, 1.6E-6)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "46851818-f89e-4f02-9968-beb0b3594909",
+ "metadata": {},
+ "source": [
+ "Each of the ASF methods go through some resizing and growth stages until
a maximum size is reached. At this point the sketches grow no further. As
expected, the $4$ bit version is half the size of the $8$ bit version and the
$6$ bit version is $3/4$ the size of the $8$ bit version. Once the $8$ bit ASF
HLL enters its final stage, it is the same size as the datasketch HLL, but can
be much smaller prior to this stage."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.4"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]