from datasketches import frequent_strings_sketch
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
# Measure how the serialized size of a frequent_strings_sketch grows while
# records are streamed into it, and draw one size-vs-records curve per
# lg_max_k setting.

# Stream shape: records fed per batch, and number of batches per sketch.
# These do not depend on lg_max_k, so they are set once up front.
batch_size = 1000
num_batches = 100

# NOTE(review): the parameter name suggests a log2 of the maximum map size;
# if so, 32 and 64 would describe extremely large limits. Confirm that these
# values (rather than e.g. 5 and 6) are what was intended.
for lg_max_k in (4, 8, 32, 64):
    sketch = frequent_strings_sketch(lg_max_k)
    records = []
    # tqdm can read the length of a range directly; no list copy is needed.
    for ibatch in tqdm(range(num_batches)):
        # Feed one batch of random normal samples, each as its string form.
        x = np.random.randn(batch_size)
        for xi in x:
            sketch.update(str(xi))
        # Snapshot the serialized footprint after every completed batch.
        records.append({
            'num_records': (ibatch + 1) * batch_size,
            'size_bytes': len(sketch.serialize()),
        })
    s = pd.DataFrame(records).set_index('num_records', drop=True)
    # Create the axes explicitly and hand them to pandas. DataFrame.plot
    # opens a figure of its own when no `ax` is given, which would leave a
    # bare plt.figure() behind as an empty window.
    _fig, ax = plt.subplots()
    s.plot(marker='.', ax=ax)
    ax.set_title(f'frequent_strings_sketch sketch size. lg_max_k = {lg_max_k}')

# Display every figure once all configurations have been measured.
plt.show()