Practical 5: Text Clustering and Topic Modeling¶


Text Mining, Transforming Text into Knowledge (202400006)¶

In this practical, we are going to apply different clustering algorithms and a topic modeling approach to sports news articles and cluster them into different groups.

Today we will use the following libraries. Take care to have them installed!

In [1]:
from sklearn.datasets import load_files
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

from sklearn.cluster import MiniBatchKMeans
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram

from sklearn import metrics

from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation

# for reproducibility
random_state = 321

Let's get started!¶

1. Here we are going to use another set of BBC news articles: the BBC Sport dataset. This dataset was provided as a benchmark for machine learning research. It consists of 737 documents from the BBC Sport website, corresponding to sports news articles from 2004-2005 in five topical areas: athletics, cricket, football, rugby, and tennis. Upload the bbcsport-fulltext.zip file and extract it into your data folder. Then use the code below to convert the resulting object to a dataframe.

In [2]:
# Load the dataset, handling encoding errors gracefully
data = load_files('data/bbcsport-fulltext/bbcsport', encoding='utf-8', decode_error='replace')

# Convert the data into a pandas DataFrame
df = pd.DataFrame(list(zip(data['data'], data['target'])), columns=['text', 'label'])

# Display the first few rows
print(df.head())
                                                text  label
0  England victory tainted by history\n\nAs Engla...      1
1  Australia complete sweep\n\nThird Test, Sydney...      1
2  UK Athletics agrees new kit deal\n\nUK Athleti...      0
3  Bekele sets sights on world mark\n\nOlympic 10...      0
4  Captains lining up for Aid match\n\nIreland's ...      3
In [3]:
labels, counts = np.unique(df['label'], return_counts=True)
print(dict(zip(data.target_names, counts)))
{'athletics': np.int64(101), 'cricket': np.int64(124), 'football': np.int64(265), 'rugby': np.int64(147), 'tennis': np.int64(100)}

2. For text clustering and topic modeling, we will ignore the outcome variable (labels), but we will use it when evaluating the models. Create a copy of the dataframe without the outcome variable.

In [4]:
bbcsport_text = pd.DataFrame(df['text'])
bbcsport_text.head()
Out[4]:
text
0 England victory tainted by history\n\nAs Engla...
1 Australia complete sweep\n\nThird Test, Sydney...
2 UK Athletics agrees new kit deal\n\nUK Athleti...
3 Bekele sets sights on world mark\n\nOlympic 10...
4 Captains lining up for Aid match\n\nIreland's ...

3. Apply the following pre-processing steps and convert the data to a document-term matrix with TF-IDF weights:

  • convert to lowercase
  • remove stopwords
  • remove numbers
  • extract uni- and bi-grams
  • remove terms that occur in less than 2 documents
  • remove one-letter terms, e.g. 'a' or 's'
In [5]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                   lowercase=True,
                                   min_df=2,
                                   ngram_range=(1,2),
                                   token_pattern=r'(?u)\b[A-Za-z][A-Za-z]+\b')
tfidf_vectorizer.fit(df.text.values)
tfidf_matrix = tfidf_vectorizer.transform(df.text.values)
tfidf_matrix.shape
Out[5]:
(737, 21604)
In [6]:
# You can check the vocabulary of your vectorizer with the following line
# tfidf_vectorizer.vocabulary_

K-Means clustering¶

4. Use the MiniBatchKMeans function from the sklearn package to run a K-Means clustering algorithm with 5 clusters.

In [7]:
n_clusters = 5
cls = MiniBatchKMeans(n_clusters=n_clusters, random_state=random_state, n_init = 2)
cls.fit(tfidf_matrix)
Out[7]:
MiniBatchKMeans(n_clusters=5, n_init=2, random_state=321)

5. What are the top terms in each cluster?

In [8]:
print("Top terms per cluster:")

order_centroids = cls.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names_out()
for i in range(n_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s/' % terms[ind], end='')
    print()
# https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html
Top terms per cluster:
Cluster 0: olympic/ race/ indoor/ athens/ holmes/ kenteris/ world/ iaaf/ champion/ greek/
Cluster 1: chelsea/ club/ said/ league/ united/ arsenal/ liverpool/ football/ players/ cup/
Cluster 2: cricket/ test/ pakistan/ series/ day/ england/ india/ south/ australia/ south africa/
Cluster 3: england/ wales/ ireland/ half/ rugby/ france/ robinson/ nations/ game/ italy/
Cluster 4: open/ seed/ roddick/ australian/ set/ federer/ australian open/ hewitt/ final/ said/

Based on these 10 top terms we can manually label our clustering output as:

  • cluster 0: athletics
  • cluster 1: football
  • cluster 2: cricket
  • cluster 3: rugby
  • cluster 4: tennis
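Instead of labeling by eye, you can also map each cluster to the majority true label among its members and score the result. A minimal sketch on made-up label arrays (on the real data you would use cls.labels_ and df['label']):

```python
import numpy as np

# Hypothetical cluster assignments and ground-truth labels, for illustration only
cluster_labels = np.array([0, 0, 1, 1, 2, 2, 2, 0])
true_labels = np.array([1, 1, 0, 0, 2, 2, 2, 1])

# map each cluster id to the majority true label among its members
mapping = {}
for c in np.unique(cluster_labels):
    values, counts = np.unique(true_labels[cluster_labels == c], return_counts=True)
    mapping[int(c)] = int(values[np.argmax(counts)])

# relabel the clusters and compute a simple accuracy
mapped = np.array([mapping[int(c)] for c in cluster_labels])
accuracy = (mapped == true_labels).mean()
print(mapping)   # {0: 1, 1: 0, 2: 2}
print(accuracy)  # 1.0
```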

6. Visualize the output of the K-Means clustering: first apply a PCA method to transform the high-dimensional feature space into 2 dimensions, and then plot the points using a scatter plot.

In [9]:
# reduce the features to 2D
pca = PCA(n_components=2, random_state=random_state)
reduced_features = pca.fit_transform(tfidf_matrix.toarray())

# reduce the cluster centers to 2D
reduced_cluster_centers = pca.transform(cls.cluster_centers_)

plt.scatter(reduced_features[:,0], reduced_features[:,1], c=cls.predict(tfidf_matrix))
plt.scatter(reduced_cluster_centers[:, 0], reduced_cluster_centers[:,1], marker='x', s=150, c='b')
Out[9]:
<matplotlib.collections.PathCollection at 0x11a275f90>
[Figure: documents in 2D PCA space, colored by K-Means cluster, with cluster centers marked as crosses]

7. Evaluate the quality of the K-Means clustering with the sklearn metrics for clustering: homogeneity_score, completeness_score, v_measure_score, adjusted_rand_score, silhouette_score.

Evaluation of unsupervised learning algorithms is difficult and often requires human judgement, but there are some metrics you can use. They fall into two kinds, depending on whether or not you have ground-truth labels.

If you have a labelled dataset, you can use metrics that give you an idea of how good your clustering model is. For this purpose you can use the sklearn.metrics module; for example, homogeneity_score is one of the possible metrics. As per the documentation, the score ranges between 0 and 1, where 1 stands for perfectly homogeneous labeling.

If you do not have labels for your dataset, you can still evaluate your clustering model with some metrics. One of them is the silhouette_score. From sklearn's documentation: the Silhouette Coefficient is calculated using the mean intra-cluster distance (a) and the mean nearest-cluster distance (b) for each sample. The Silhouette Coefficient for a sample is (b - a) / max(a, b). To clarify, b is the distance between a sample and the nearest cluster that the sample is not a part of. The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been assigned to the wrong cluster, as a different cluster is more similar.
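As a sanity check of this definition, two well-separated toy blobs should score close to 1, since the intra-cluster distance a is tiny compared to the nearest-cluster distance b (coordinates made up for illustration):

```python
import numpy as np
from sklearn.metrics import silhouette_score

# two well-separated 2D blobs (hypothetical points): a is small, b is large
X = np.array([[0.0, 0.0], [0.1, 0.0], [0.0, 0.1],
              [5.0, 5.0], [5.1, 5.0], [5.0, 5.1]])
labels = [0, 0, 0, 1, 1, 1]

score = silhouette_score(X, labels)
print(round(score, 3))  # close to 1
```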

In [10]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(data.target, cls.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(data.target, cls.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(data.target, cls.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(data.target, cls.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(tfidf_matrix, cls.labels_, sample_size=1000))
Homogeneity: 0.776
Completeness: 0.785
V-measure: 0.780
Adjusted Rand-Index: 0.767
Silhouette Coefficient: 0.016

8. Apply the K-Means clustering method for a range of 3 to 7 clusters, and calculate the squared loss obtained for each clustering. Apply the Elbow method to find the optimal K. (Tip: use cls.inertia_ for the squared loss.)

In [11]:
num_clus = [x for x in range(3, 7)]
squared_errors = []
for cluster in num_clus:
    cls = MiniBatchKMeans(n_clusters=cluster, random_state=random_state, n_init = 2)
    cls.fit(tfidf_matrix) # Train Clusters
    squared_errors.append(cls.inertia_) # Appending the squared loss obtained in the list
In [12]:
# Choosing the best number of clusters using the Elbow Method.
# credit: parts of this cell are adapted from Stack Overflow answers.
# Note: the code below simply picks the k with the minimum squared loss;
# inspect the plotted curve to judge where the actual elbow lies.

num_clus = [x for x in range(2, 7)]
squared_errors = []
for cluster in num_clus:
    cls = MiniBatchKMeans(n_clusters=cluster, random_state=random_state, n_init = 2)
    cls.fit(tfidf_matrix) # Train Clusters
    squared_errors.append(cls.inertia_) # Appending the squared loss obtained in the list

optimal_clusters = np.argmin(squared_errors) + 2 # argmin returns the index of the minimum loss; num_clus starts at 2
plt.plot(num_clus, squared_errors)
plt.title("Elbow Curve to find the no. of clusters.")
plt.xlabel("Number of clusters.")
plt.ylabel("Squared Loss.")
xy = (optimal_clusters, min(squared_errors))
plt.annotate('(%s, %s)' % xy, xy = xy, textcoords='data')
plt.show()

print("The optimal number of clusters obtained is - ", optimal_clusters)
print("The loss for optimal cluster is - ", min(squared_errors))
[Figure: elbow curve of squared loss against the number of clusters]
The optimal number of clusters obtained is -  6
The loss for optimal cluster is -  691.5376630546459
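Because inertia decreases monotonically with k, taking its argmin (as above) always selects the largest k tried; the elbow is better located where the curve bends most sharply. One common heuristic uses second differences of the inertia. A minimal sketch with hypothetical inertia values:

```python
import numpy as np

# Hypothetical inertia values for k = 2..6 (decreasing, flattening after the elbow)
ks = np.array([2, 3, 4, 5, 6])
inertias = np.array([900.0, 800.0, 710.0, 640.0, 635.0])

# the elbow is where the curve bends most sharply, i.e. where the
# second difference of the inertia is largest
second_diff = np.diff(inertias, 2)               # one value per interior k (3, 4, 5)
elbow_k = int(ks[1:-1][np.argmax(second_diff)])
print(elbow_k)  # 5
```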

9. Use the following two news articles as your test data, and predict cluster labels for your new dataset with the best value for K and the K-Means algorithm.

In [13]:
documents = ['Frank de Boer out as Oranje manager after early Euro 2020 exit Dutch men’s football team coach.',
             'The time has come for Nadal to be selective in the events that he should and should not play. This is where he can start the difficulty. After a rigorous participation of the clay season, Rafael Nadal definitely wants to conserve his energies for as long as possible.']
In [14]:
n_clusters = 5
cls = MiniBatchKMeans(n_clusters=n_clusters, random_state=random_state, n_init = 2)
cls.fit(tfidf_matrix)
Out[14]:
MiniBatchKMeans(n_clusters=5, n_init=2, random_state=321)
In [15]:
tfidf_test = tfidf_vectorizer.transform(documents)
tfidf_test.shape
Out[15]:
(2, 21604)
In [16]:
print(cls.predict(tfidf_test))
[1 1]

Hierarchical clustering (optional)¶

10. Hierarchical clustering is a type of unsupervised machine learning algorithm used to cluster unlabeled data points. Like K-Means clustering, it groups data points with similar characteristics together. Apply hierarchical clustering with ward linkage on the BBC Sport dataset. Fit the model with 5 clusters and check the predicted labels.

In [17]:
cls2 = AgglomerativeClustering(n_clusters=5, metric='euclidean', linkage='ward')
cls2 = cls2.fit(tfidf_matrix.toarray())
In [18]:
cls2.labels_
Out[18]:
array([1, 1, 0, 0, 4, 2, 0, 0, 2, 0, 0, 2, 0, 1, 1, 1, 4, 2, 2, 3, 2, 0,
       2, 1, 2, 2, 1, 2, 4, 0, 0, 0, 0, 1, 2, 2, 2, 4, 0, 2, 2, 4, 2, 2,
       2, 4, 4, 2, 2, 0, 2, 4, 2, 2, 2, 2, 2, 0, 4, 4, 2, 2, 4, 2, 2, 2,
       4, 4, 2, 2, 2, 4, 2, 4, 1, 0, 0, 4, 2, 2, 1, 0, 1, 3, 2, 4, 2, 4,
       2, 1, 0, 0, 2, 1, 4, 0, 2, 4, 1, 2, 4, 0, 0, 2, 2, 4, 0, 2, 1, 2,
       1, 2, 2, 3, 4, 1, 0, 1, 2, 2, 0, 1, 2, 2, 2, 2, 0, 2, 2, 0, 0, 4,
       0, 2, 2, 2, 1, 2, 2, 4, 2, 1, 1, 0, 2, 2, 2, 0, 2, 2, 4, 2, 4, 2,
       0, 2, 4, 2, 2, 4, 0, 0, 0, 4, 2, 2, 0, 0, 2, 2, 1, 4, 4, 1, 1, 2,
       2, 1, 2, 0, 4, 2, 2, 4, 4, 2, 0, 2, 2, 2, 1, 2, 1, 2, 0, 2, 4, 2,
       0, 2, 2, 0, 2, 2, 1, 1, 2, 2, 2, 4, 2, 0, 2, 2, 0, 1, 2, 2, 4, 1,
       2, 4, 0, 1, 2, 2, 2, 2, 4, 1, 2, 2, 2, 3, 2, 0, 0, 2, 4, 4, 2, 0,
       2, 0, 2, 0, 3, 2, 2, 0, 4, 2, 0, 4, 2, 4, 4, 1, 2, 2, 2, 2, 2, 2,
       2, 1, 0, 2, 1, 2, 1, 4, 2, 1, 2, 1, 2, 1, 1, 0, 4, 1, 2, 4, 2, 2,
       0, 0, 1, 0, 3, 1, 4, 2, 1, 1, 2, 1, 2, 1, 1, 1, 4, 2, 2, 0, 2, 1,
       2, 4, 0, 4, 1, 0, 2, 4, 0, 2, 1, 2, 1, 2, 2, 2, 2, 0, 0, 2, 2, 2,
       4, 4, 0, 2, 2, 2, 4, 1, 0, 0, 0, 0, 2, 1, 2, 4, 2, 2, 4, 1, 2, 2,
       2, 0, 2, 0, 1, 4, 2, 0, 1, 0, 2, 2, 0, 0, 2, 2, 0, 2, 0, 0, 0, 2,
       2, 2, 2, 0, 4, 1, 4, 2, 1, 2, 2, 4, 2, 2, 2, 2, 4, 1, 1, 1, 2, 0,
       2, 2, 0, 2, 2, 2, 2, 2, 1, 0, 4, 4, 2, 2, 2, 2, 1, 2, 0, 2, 2, 0,
       4, 2, 2, 0, 0, 1, 0, 1, 1, 2, 0, 3, 0, 2, 2, 2, 2, 0, 2, 2, 1, 0,
       2, 2, 2, 1, 3, 0, 0, 2, 0, 0, 1, 0, 2, 2, 2, 4, 2, 2, 2, 1, 2, 1,
       4, 1, 1, 2, 3, 0, 2, 0, 1, 4, 0, 2, 2, 0, 2, 4, 0, 0, 2, 0, 4, 2,
       4, 2, 1, 1, 2, 1, 2, 0, 4, 4, 0, 2, 2, 2, 0, 4, 0, 2, 0, 0, 2, 1,
       0, 1, 2, 2, 0, 0, 1, 4, 4, 1, 1, 2, 2, 0, 0, 4, 0, 1, 1, 2, 4, 2,
       0, 2, 0, 4, 2, 0, 3, 2, 2, 0, 0, 1, 2, 0, 2, 2, 0, 2, 4, 0, 2, 4,
       1, 4, 4, 2, 1, 1, 4, 0, 0, 0, 2, 0, 0, 2, 2, 1, 2, 1, 1, 1, 2, 2,
       2, 4, 4, 2, 2, 2, 2, 1, 4, 2, 2, 1, 2, 2, 1, 1, 1, 4, 2, 1, 2, 2,
       0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 4, 2, 2, 4, 0, 2, 1, 2, 0, 0, 2, 0,
       1, 3, 3, 2, 0, 2, 0, 0, 0, 0, 2, 2, 1, 1, 4, 1, 4, 0, 2, 1, 0, 0,
       4, 2, 4, 2, 1, 2, 4, 4, 2, 2, 2, 4, 1, 0, 0, 1, 4, 2, 4, 2, 2, 1,
       1, 1, 0, 3, 2, 1, 4, 4, 4, 4, 2, 0, 0, 2, 1, 2, 0, 2, 2, 0, 2, 0,
       4, 2, 2, 2, 2, 4, 3, 0, 1, 0, 1, 2, 4, 0, 1, 2, 4, 4, 0, 0, 2, 2,
       2, 1, 0, 1, 2, 2, 2, 0, 1, 0, 4, 0, 0, 1, 2, 4, 2, 0, 2, 2, 2, 2,
       0, 4, 2, 0, 0, 2, 0, 1, 4, 4, 0])

11. Plot a dendrogram for your hierarchical clustering model using the function below. To do this, you need to fit the model again without assigning the number of clusters.

In [19]:
def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack([model.children_, model.distances_,
                                      counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)
In [20]:
# setting distance_threshold=0 ensures we compute the full tree.
cls2 = AgglomerativeClustering(n_clusters=None, metric='euclidean', linkage='ward', distance_threshold=0)
cls2 = cls2.fit(tfidf_matrix.toarray())

plt.title('Hierarchical Clustering Dendrogram')
# plot the top three levels of the dendrogram
plot_dendrogram(cls2, truncate_mode='level', p=5)
plt.xlabel("Number of points in node (or index of point if no parenthesis).")
plt.show()
[Figure: truncated dendrogram of the hierarchical clustering, top five levels]
In [21]:
# another way of plotting the hierarchical clustering, using the scipy library
import scipy.cluster.hierarchy as shc
plt.figure(figsize=(10, 7))
plt.title("TF-IDF dendrogram for hierarchical clustering")
dend = shc.dendrogram(shc.linkage(tfidf_matrix.toarray(), method='ward'))
[Figure: scipy dendrogram of the ward linkage on the TF-IDF matrix]

Topic modeling¶

Topic modeling is another unsupervised method for text mining applications where we want to get an idea of what topics we have in our dataset. A topic is a collection of words that describe the overall theme. For example, in the case of news articles, you might think of topics as the categories in the dataset. Just like clustering algorithms, some topic modeling algorithms need you to specify the number of topics you want to extract from the dataset, while others determine the number of topics automatically. Here, we are going to use the Latent Dirichlet Allocation (LDA) method for topic modeling. You can check sklearn's documentation for more details about LDA in the sklearn.decomposition module.

12. One of the most widely used approaches for topic modeling is Latent Dirichlet Allocation (LDA). LDA is based on two general assumptions:

  • Documents exhibit multiple topics
  • A topic is a distribution over a fixed vocabulary

Train an LDA model from the sklearn package for topic modeling with 5 components.

In [22]:
LDA = LatentDirichletAllocation(n_components=5, random_state=321, evaluate_every=10)
LDA.fit(tfidf_matrix)
Out[22]:
LatentDirichletAllocation(evaluate_every=10, n_components=5, random_state=321)

We used LDA to create topics along with the probability distribution for each word in our vocabulary for each topic. The parameter n_components specifies the number of categories, or topics, that we want our text to be divided into.

13. Print the 10 words with the highest probabilities for each of the five topics.

In [23]:
for i, topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    print([tfidf_vectorizer.get_feature_names_out()[j] for j in topic.argsort()[-10:]])
    print('\n')
Top 10 words for topic #0:
['new', 'year', 'play', 'chelsea', 'cup', 'team', 'players', 'game', 'england', 'said']


Top 10 words for topic #1:
['women', 'championships', 'seed', 'holmes', 'open', 'world', 'indoor', 'race', 'olympic', 'champion']


Top 10 words for topic #2:
['regulations introduced', 'things ball', 'doosra fell', 'line given', 'seconds', 'need video', 'nice goal', 'mendes shot', 'harbhajan action', 'harbhajan']


Top 10 words for topic #3:
['henson', 'leicester', 'ruddock', 'drugs', 'france', 'thanou', 'iaaf', 'greek', 'kenteris', 'wales']


Top 10 words for topic #4:
['racism', 'rod', 'aragones', 'ak', 'saracens', 'idowu', 'edwards', 'umaga', 'kafer', 'hingis']


You can also use the following function for this purpose:

In [24]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)
In [25]:
display_topics(LDA, tfidf_vectorizer.get_feature_names_out(), no_top_words=4)
Out[25]:
Topic 0 words Topic 0 weights Topic 1 words Topic 1 weights Topic 2 words Topic 2 weights Topic 3 words Topic 3 weights Topic 4 words Topic 4 weights
0 said 17.5 champion 5.6 harbhajan 0.6 wales 4.9 hingis 1.0
1 england 17.1 olympic 4.9 harbhajan action 0.5 kenteris 3.5 kafer 0.8
2 game 12.8 race 4.8 mendes shot 0.4 greek 3.1 umaga 0.8
3 players 11.4 indoor 4.8 nice goal 0.4 iaaf 3.1 edwards 0.8

14. Transform the learned topics into your data. Check the shape of the output. What can be the use of this output?

In [26]:
topic_values = LDA.transform(tfidf_matrix)
topic_values.shape
Out[26]:
(737, 5)
In [27]:
topic_values
Out[27]:
array([[0.9459469 , 0.01355703, 0.01348593, 0.01352432, 0.01348582],
       [0.95645121, 0.0108862 , 0.01088773, 0.01088808, 0.01088679],
       [0.71607405, 0.21627131, 0.02249309, 0.0226196 , 0.02254195],
       ...,
       [0.67563452, 0.01603897, 0.01583429, 0.27665878, 0.01583344],
       [0.7030744 , 0.0173952 , 0.01712774, 0.24527598, 0.01712668],
       [0.70235222, 0.25269006, 0.01497634, 0.01500188, 0.01497951]],
      shape=(737, 5))

This output can be used as our new observations and features for further tasks.
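For instance, each document can be assigned its single most probable topic and cross-tabulated against the known labels to see which topic corresponds to which category. A sketch on a made-up document-topic matrix:

```python
import numpy as np
import pandas as pd

# Hypothetical document-topic matrix, as returned by LDA.transform (rows sum to 1)
topic_vals = np.array([[0.94, 0.02, 0.01, 0.02, 0.01],
                       [0.05, 0.80, 0.05, 0.05, 0.05],
                       [0.10, 0.10, 0.60, 0.10, 0.10]])

# assign each document its dominant topic
dominant_topic = topic_vals.argmax(axis=1)
print(dominant_topic)  # [0 1 2]

# cross-tabulating against (hypothetical) true labels shows the topic-to-class mapping
true_labels = pd.Series(['cricket', 'athletics', 'football'], name='label')
print(pd.crosstab(true_labels, pd.Series(dominant_topic, name='topic')))
```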

15. Use the score function for LDA to calculate the log likelihood for your data. Compare two LDA models with 5 and 10 topics.

In [28]:
LDA10 = LatentDirichletAllocation(n_components=10, random_state=321, evaluate_every=10)
LDA10.fit(tfidf_matrix)
print("The log likelihood for the LDA model with 5 topics:", LDA.score(tfidf_matrix))
print("The log likelihood for the LDA model with 10 topics:", LDA10.score(tfidf_matrix))
The log likelihood for the LDA model with 5 topics: -92722.6451258006
The log likelihood for the LDA model with 10 topics: -106698.52544694052

Many procedures use the log of the likelihood, rather than the likelihood itself, because it is easier to work with. The log likelihood will always be negative, with higher values (closer to zero) indicating a better-fitting model; here, that is the model with 5 topics.
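Besides score(), sklearn's LDA also exposes perplexity(), a per-word transformation of the likelihood for which lower values are better. A minimal sketch comparing two topic counts on a random toy count matrix (not the BBC data):

```python
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

rng = np.random.RandomState(0)
# toy document-term count matrix (hypothetical): 20 docs, 30 terms
X = rng.randint(0, 5, size=(20, 30))

for k in (2, 5):
    lda = LatentDirichletAllocation(n_components=k, random_state=0)
    lda.fit(X)
    # higher (less negative) score and lower perplexity indicate a better fit
    print(k, round(lda.score(X), 1), round(lda.perplexity(X), 1))
```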