Examples#
Note: Make sure the required NLTK resources are downloaded. If they are not, run the following commands:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# Uncomment the line below if running in Colab;
# the package needs to be installed for the notebook to run.
# ! pip install -U stream_topic
import warnings
warnings.filterwarnings("ignore")
from stream_topic.models import KmeansTM
from stream_topic.utils import TMDataset
Optimize model parameters via Bayesian optimization
dataset = TMDataset()
dataset.fetch_dataset("BBC_News")
dataset.preprocess(model_type="KmeansTM")
2024-08-09 15:33:16.644 | INFO | stream_topic.utils.dataset:fetch_dataset:118 - Fetching dataset: BBC_News
2024-08-09 15:33:17.193 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:331 - Downloading dataset from github
2024-08-09 15:33:17.848 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:333 - Dataset downloaded successfully at ~/stream_topic_data/
2024-08-09 15:33:18.133 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:361 - Downloading dataset info from github
2024-08-09 15:33:18.324 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:363 - Dataset info downloaded successfully at ~/stream_topic_data/
Preprocessing documents: 100%|██████████| 2225/2225 [00:11<00:00, 198.41it/s]
model = KmeansTM()
output = model.optimize_and_fit(dataset, n_trials=10, max_topics=20, min_topics=3)
[I 2024-08-09 15:33:29,603] A new study created in memory with name: no-name-882315ac-44ed-4d90-9fc1-cff18636e26d
2024-08-09 15:33:29.606 | INFO | stream_topic.models.KmeansTM:fit:206 - --- Training KmeansTM topic model ---
2024-08-09 15:33:30.201 | INFO | stream_topic.models.abstract_helper_models.base:prepare_embeddings:215 - --- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---
2024-08-09 15:33:30.285 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:302 - Downloading embeddings from github
2024-08-09 15:33:31.073 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:304 - Embeddings downloaded successfully at ~/stream_topic_data/
2024-08-09 15:33:31.083 | INFO | stream_topic.models.abstract_helper_models.base:dim_reduction:196 - --- Reducing dimensions ---
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-08-09 15:33:35.407 | INFO | stream_topic.models.KmeansTM:_clustering:155 - --- Creating document cluster ---
2024-08-09 15:33:36.016 | INFO | stream_topic.models.KmeansTM:fit:240 - --- Training completed successfully. ---
[I 2024-08-09 15:33:36,017] Trial 0 finished with value: -2463.7082266615807 and parameters: {'n_topics': 14, 'n_neighbors': 12, 'n_components': 6, 'metric': 'euclidean', 'init': 'random', 'n_init': 23, 'max_iter': 174}. Best is trial 0 with value: -2463.7082266615807.
2024-08-09 15:33:36.018 | INFO | stream_topic.models.KmeansTM:fit:206 - --- Training KmeansTM topic model ---
2024-08-09 15:33:36.102 | INFO | stream_topic.models.abstract_helper_models.base:prepare_embeddings:215 - --- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---
2024-08-09 15:33:36.178 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:302 - Downloading embeddings from github
2024-08-09 15:33:36.746 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:304 - Embeddings downloaded successfully at ~/stream_topic_data/
2024-08-09 15:33:36.746 | INFO | stream_topic.models.abstract_helper_models.base:dim_reduction:196 - --- Reducing dimensions ---
2024-08-09 15:33:40.089 | INFO | stream_topic.models.KmeansTM:_clustering:155 - --- Creating document cluster ---
2024-08-09 15:33:40.457 | INFO | stream_topic.models.KmeansTM:fit:240 - --- Training completed successfully. ---
[I 2024-08-09 15:33:40,458] Trial 1 finished with value: -2946.160921364957 and parameters: {'n_topics': 19, 'n_neighbors': 36, 'n_components': 27, 'metric': 'cosine', 'init': 'random', 'n_init': 26, 'max_iter': 766}. Best is trial 1 with value: -2946.160921364957.
2024-08-09 15:33:40.459 | INFO | stream_topic.models.KmeansTM:fit:206 - --- Training KmeansTM topic model ---
2024-08-09 15:33:40.554 | INFO | stream_topic.models.abstract_helper_models.base:prepare_embeddings:215 - --- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---
2024-08-09 15:33:40.644 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:302 - Downloading embeddings from github
2024-08-09 15:33:40.978 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:304 - Embeddings downloaded successfully at ~/stream_topic_data/
2024-08-09 15:33:40.978 | INFO | stream_topic.models.abstract_helper_models.base:dim_reduction:196 - --- Reducing dimensions ---
2024-08-09 15:33:43.766 | INFO | stream_topic.models.KmeansTM:_clustering:155 - --- Creating document cluster ---
2024-08-09 15:33:44.160 | INFO | stream_topic.models.KmeansTM:fit:240 - --- Training completed successfully. ---
[I 2024-08-09 15:33:44,161] Trial 2 finished with value: -3400.3953215739325 and parameters: {'n_topics': 7, 'n_neighbors': 48, 'n_components': 48, 'metric': 'euclidean', 'init': 'random', 'n_init': 29, 'max_iter': 231}. Best is trial 2 with value: -3400.3953215739325.
2024-08-09 15:33:44.162 | INFO | stream_topic.models.KmeansTM:fit:206 - --- Training KmeansTM topic model ---
2024-08-09 15:33:44.242 | INFO | stream_topic.models.abstract_helper_models.base:prepare_embeddings:215 - --- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---
2024-08-09 15:33:44.325 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:302 - Downloading embeddings from github
2024-08-09 15:33:44.683 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:304 - Embeddings downloaded successfully at ~/stream_topic_data/
2024-08-09 15:33:44.684 | INFO | stream_topic.models.abstract_helper_models.base:dim_reduction:196 - --- Reducing dimensions ---
2024-08-09 15:33:47.631 | INFO | stream_topic.models.KmeansTM:_clustering:155 - --- Creating document cluster ---
2024-08-09 15:33:47.997 | INFO | stream_topic.models.KmeansTM:fit:240 - --- Training completed successfully. ---
[I 2024-08-09 15:33:47,998] Trial 3 finished with value: -2834.7694686925297 and parameters: {'n_topics': 13, 'n_neighbors': 26, 'n_components': 12, 'metric': 'cosine', 'init': 'random', 'n_init': 25, 'max_iter': 379}. Best is trial 2 with value: -3400.3953215739325.
2024-08-09 15:33:48.000 | INFO | stream_topic.models.KmeansTM:fit:206 - --- Training KmeansTM topic model ---
2024-08-09 15:33:48.096 | INFO | stream_topic.models.abstract_helper_models.base:prepare_embeddings:215 - --- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---
2024-08-09 15:33:48.170 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:302 - Downloading embeddings from github
2024-08-09 15:33:48.445 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:304 - Embeddings downloaded successfully at ~/stream_topic_data/
2024-08-09 15:33:48.445 | INFO | stream_topic.models.abstract_helper_models.base:dim_reduction:196 - --- Reducing dimensions ---
2024-08-09 15:33:51.185 | INFO | stream_topic.models.KmeansTM:_clustering:155 - --- Creating document cluster ---
2024-08-09 15:33:52.744 | INFO | stream_topic.models.KmeansTM:fit:240 - --- Training completed successfully. ---
[I 2024-08-09 15:33:52,745] Trial 4 finished with value: -3160.985634056173 and parameters: {'n_topics': 12, 'n_neighbors': 28, 'n_components': 38, 'metric': 'euclidean', 'init': 'k-means++', 'n_init': 24, 'max_iter': 547}. Best is trial 2 with value: -3400.3953215739325.
2024-08-09 15:33:52.746 | INFO | stream_topic.models.KmeansTM:fit:206 - --- Training KmeansTM topic model ---
2024-08-09 15:33:52.841 | INFO | stream_topic.models.abstract_helper_models.base:prepare_embeddings:215 - --- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---
2024-08-09 15:33:52.917 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:302 - Downloading embeddings from github
2024-08-09 15:33:53.191 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:304 - Embeddings downloaded successfully at ~/stream_topic_data/
2024-08-09 15:33:53.193 | INFO | stream_topic.models.abstract_helper_models.base:dim_reduction:196 - --- Reducing dimensions ---
2024-08-09 15:33:56.358 | INFO | stream_topic.models.KmeansTM:_clustering:155 - --- Creating document cluster ---
2024-08-09 15:33:56.905 | INFO | stream_topic.models.KmeansTM:fit:240 - --- Training completed successfully. ---
[I 2024-08-09 15:33:56,906] Trial 5 finished with value: -2794.15342912206 and parameters: {'n_topics': 14, 'n_neighbors': 22, 'n_components': 10, 'metric': 'cosine', 'init': 'k-means++', 'n_init': 13, 'max_iter': 776}. Best is trial 2 with value: -3400.3953215739325.
2024-08-09 15:33:56.908 | INFO | stream_topic.models.KmeansTM:fit:206 - --- Training KmeansTM topic model ---
2024-08-09 15:33:57.004 | INFO | stream_topic.models.abstract_helper_models.base:prepare_embeddings:215 - --- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---
2024-08-09 15:33:57.090 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:302 - Downloading embeddings from github
2024-08-09 15:33:57.806 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:304 - Embeddings downloaded successfully at ~/stream_topic_data/
2024-08-09 15:33:57.807 | INFO | stream_topic.models.abstract_helper_models.base:dim_reduction:196 - --- Reducing dimensions ---
2024-08-09 15:34:00.880 | INFO | stream_topic.models.KmeansTM:_clustering:155 - --- Creating document cluster ---
2024-08-09 15:34:01.233 | INFO | stream_topic.models.KmeansTM:fit:240 - --- Training completed successfully. ---
[I 2024-08-09 15:34:01,234] Trial 6 finished with value: -3059.2387453702095 and parameters: {'n_topics': 15, 'n_neighbors': 37, 'n_components': 20, 'metric': 'cosine', 'init': 'random', 'n_init': 20, 'max_iter': 976}. Best is trial 2 with value: -3400.3953215739325.
2024-08-09 15:34:01.236 | INFO | stream_topic.models.KmeansTM:fit:206 - --- Training KmeansTM topic model ---
2024-08-09 15:34:01.316 | INFO | stream_topic.models.abstract_helper_models.base:prepare_embeddings:215 - --- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---
2024-08-09 15:34:01.408 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:302 - Downloading embeddings from github
2024-08-09 15:34:01.673 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:304 - Embeddings downloaded successfully at ~/stream_topic_data/
2024-08-09 15:34:01.673 | INFO | stream_topic.models.abstract_helper_models.base:dim_reduction:196 - --- Reducing dimensions ---
2024-08-09 15:34:04.729 | INFO | stream_topic.models.KmeansTM:_clustering:155 - --- Creating document cluster ---
2024-08-09 15:34:05.132 | INFO | stream_topic.models.KmeansTM:fit:240 - --- Training completed successfully. ---
[I 2024-08-09 15:34:05,134] Trial 7 finished with value: -3027.354335724434 and parameters: {'n_topics': 18, 'n_neighbors': 37, 'n_components': 24, 'metric': 'cosine', 'init': 'random', 'n_init': 30, 'max_iter': 728}. Best is trial 2 with value: -3400.3953215739325.
2024-08-09 15:34:05.135 | INFO | stream_topic.models.KmeansTM:fit:206 - --- Training KmeansTM topic model ---
2024-08-09 15:34:05.210 | INFO | stream_topic.models.abstract_helper_models.base:prepare_embeddings:215 - --- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---
2024-08-09 15:34:05.284 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:302 - Downloading embeddings from github
2024-08-09 15:34:05.681 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:304 - Embeddings downloaded successfully at ~/stream_topic_data/
2024-08-09 15:34:05.682 | INFO | stream_topic.models.abstract_helper_models.base:dim_reduction:196 - --- Reducing dimensions ---
2024-08-09 15:34:08.518 | INFO | stream_topic.models.KmeansTM:_clustering:155 - --- Creating document cluster ---
2024-08-09 15:34:08.838 | INFO | stream_topic.models.KmeansTM:fit:240 - --- Training completed successfully. ---
[I 2024-08-09 15:34:08,838] Trial 8 finished with value: -2060.2771839726806 and parameters: {'n_topics': 3, 'n_neighbors': 12, 'n_components': 18, 'metric': 'cosine', 'init': 'random', 'n_init': 15, 'max_iter': 792}. Best is trial 2 with value: -3400.3953215739325.
2024-08-09 15:34:08.840 | INFO | stream_topic.models.KmeansTM:fit:206 - --- Training KmeansTM topic model ---
2024-08-09 15:34:08.927 | INFO | stream_topic.models.abstract_helper_models.base:prepare_embeddings:215 - --- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---
2024-08-09 15:34:09.008 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:302 - Downloading embeddings from github
2024-08-09 15:34:09.634 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:304 - Embeddings downloaded successfully at ~/stream_topic_data/
2024-08-09 15:34:09.635 | INFO | stream_topic.models.abstract_helper_models.base:dim_reduction:196 - --- Reducing dimensions ---
2024-08-09 15:34:12.202 | INFO | stream_topic.models.KmeansTM:_clustering:155 - --- Creating document cluster ---
2024-08-09 15:34:12.564 | INFO | stream_topic.models.KmeansTM:fit:240 - --- Training completed successfully. ---
[I 2024-08-09 15:34:12,565] Trial 9 finished with value: -2582.634045394511 and parameters: {'n_topics': 16, 'n_neighbors': 16, 'n_components': 21, 'metric': 'euclidean', 'init': 'random', 'n_init': 23, 'max_iter': 608}. Best is trial 2 with value: -3400.3953215739325.
2024-08-09 15:34:12.566 | INFO | stream_topic.models.abstract_helper_models.base:optimize_hyperparameters:389 - Optimal parameters: {'n_neighbors': 48, 'n_components': 48, 'metric': 'euclidean', 'init': 'random', 'n_init': 29, 'max_iter': 231} with 7 topics based on AIC.
2024-08-09 15:34:12.567 | INFO | stream_topic.models.KmeansTM:fit:206 - --- Training KmeansTM topic model ---
2024-08-09 15:34:12.664 | INFO | stream_topic.models.abstract_helper_models.base:prepare_embeddings:215 - --- Loading precomputed paraphrase-MiniLM-L3-v2 embeddings ---
2024-08-09 15:34:12.735 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:302 - Downloading embeddings from github
2024-08-09 15:34:13.155 | INFO | stream_topic.utils.data_downloader:load_custom_dataset_from_url:304 - Embeddings downloaded successfully at ~/stream_topic_data/
2024-08-09 15:34:13.156 | INFO | stream_topic.models.abstract_helper_models.base:dim_reduction:196 - --- Reducing dimensions ---
2024-08-09 15:34:15.585 | INFO | stream_topic.models.KmeansTM:_clustering:155 - --- Creating document cluster ---
2024-08-09 15:34:15.897 | INFO | stream_topic.models.KmeansTM:fit:240 - --- Training completed successfully. ---
topics = model.get_topics()
print(len(topics))
7
Evaluate#
from stream_topic.metrics import NPMI, ISIM
metric = NPMI(dataset)
metric.score(topics)
0.19318
isim_metric = ISIM()
isim_metric.score(topics)
0.18481285870075226