In [6]:
from datasketches import frequent_strings_sketch
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
for lg_max_k in (4, 8, 32, 64):
    sketch = frequent_strings_sketch(lg_max_k)
    batch_size = 1000
    num_batches = 100
    records = []
    for ibatch in tqdm(list(range(num_batches))):
        x = np.random.randn(batch_size)
        for xi in x:
            sketch.update(str(xi))
        records.append({
            'num_records': (ibatch + 1)*batch_size,
            'size_bytes': len(sketch.serialize()),
        })
    s = pd.DataFrame(records).set_index('num_records', drop=True)
    plt.figure()
    s.plot(marker='.')
    plt.title(f'frequent_strings_sketch sketch size.  lg_max_k = {lg_max_k}')
    plt.show()
100%|██████████| 100/100 [00:00<00:00, 540.30it/s]
<Figure size 432x288 with 0 Axes>
100%|██████████| 100/100 [00:00<00:00, 604.61it/s]
<Figure size 432x288 with 0 Axes>
100%|██████████| 100/100 [00:00<00:00, 104.61it/s]
<Figure size 432x288 with 0 Axes>
100%|██████████| 100/100 [00:00<00:00, 107.06it/s]
<Figure size 432x288 with 0 Axes>
In [ ]: