Source code for clarans.initialization

import warnings
import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.utils import check_random_state


[docs] def initialize_heuristic(X, n_clusters, metric="euclidean"): """ Initialize medoids using a heuristic approach. Picks the n_clusters points with the smallest sum distance to every other point. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. n_clusters : int The number of clusters to form. metric : str or callable, default='euclidean' The metric to use when calculating distance between instances in a feature array. Returns ------- current_medoids_indices : ndarray of shape (n_clusters,), dtype int Indices of the selected medoids in the dataset. Notes ----- This method computes the full pairwise distance matrix and therefore has O(n^2) time and memory complexity. """ if X.shape[0] >= 50000: warnings.warn( f"The 'heuristic' initialization involves a full distance matrix calculation " f"for {X.shape[0]} samples. This can be extremely slow and memory-intensive (O(N^2)).", UserWarning, ) # This requires O(N^2) complexity D = pairwise_distances(X, metric=metric) dist_sums = np.sum(D, axis=1) current_medoids_indices = np.argpartition(dist_sums, n_clusters - 1)[:n_clusters] return current_medoids_indices
[docs] def initialize_build(X, n_clusters, metric="euclidean"): """ Initialize medoids using the PAM BUILD step. Greedily selects the first medoid that minimizes total distance, then subsequently adds medoids that maximally decrease the total cost. Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. n_clusters : int The number of clusters to form. metric : str or callable, default='euclidean' The metric to use when calculating distance between instances in a feature array. Returns ------- medoids : ndarray of shape (n_clusters,), dtype int Indices of the selected medoids in the dataset. """ n_samples = X.shape[0] if n_samples >= 25000: warnings.warn( f"The 'build' initialization involves a full distance matrix calculation " f"for {n_samples} samples. This can be extremely slow and memory-intensive (O(N^2)).", UserWarning, ) medoids = [] # Calculate full distance matrix potentially (expensive) or compute on fly # Using euclidean_distances or pairwise_distances D = pairwise_distances(X, metric=metric) dist_sums = D.sum(axis=1) first_medoid = np.argmin(dist_sums) medoids.append(int(first_medoid)) dist_to_nearest = D[:, first_medoid] for _ in range(1, n_clusters): # Mask for medoids is_medoid = np.zeros(n_samples, dtype=bool) is_medoid[medoids] = True # We only care about candidates candidate_indices = np.where(~is_medoid)[0] D_candidates = D[:, candidate_indices] diffs = dist_to_nearest[:, np.newaxis] - D_candidates gains = np.sum(np.maximum(diffs, 0), axis=0) best_candidate_idx_in_candidates = np.argmax(gains) best_candidate = candidate_indices[best_candidate_idx_in_candidates] medoids.append(int(best_candidate)) dist_to_nearest = np.minimum(dist_to_nearest, D[:, best_candidate]) return np.array(medoids)
[docs] def initialize_k_medoids_plus_plus( X, n_clusters, random_state=None, metric="euclidean", n_local_trials=None ): """ Initialize medoids using k-medoids++ (similar to k-means++). Parameters ---------- X : {array-like, sparse matrix} of shape (n_samples, n_features) The input samples. n_clusters : int The number of clusters to form. random_state : int, RandomState instance or None, default=None Determines random number generation for centroid initialization. metric : str or callable, default='euclidean' The metric to use when calculating distance between instances in a feature array. n_local_trials : int, default=None The number of seeding trials for each center (except the first), of which the one reducing inertia the most is greedily chosen. If None, the function uses a small logarithmic default. Returns ------- medoids : ndarray of shape (n_clusters,), dtype int Indices of the selected medoids in the dataset. Notes ----- This implementation follows the k-means++ style seeding but uses distances squared and picks medoids (data indices) rather than centroids. """ random_state = check_random_state(random_state) n_samples = X.shape[0] medoid_indices = np.empty(n_clusters, dtype=int) if n_local_trials is None: n_local_trials = 2 + int(np.log(n_clusters)) first_medoid = random_state.randint(0, n_samples) medoid_indices[0] = first_medoid closest = pairwise_distances( X, X[first_medoid].reshape(1, -1), metric=metric, ).flatten() closest_dist_sq = closest**2 current_pot = closest_dist_sq.sum() for c in range(1, n_clusters): rand_vals = random_state.random_sample(n_local_trials) * current_pot cumsum_dist = np.cumsum(closest_dist_sq) candidate_ids = np.searchsorted(cumsum_dist, rand_vals) np.clip(candidate_ids, 0, n_samples - 1, out=candidate_ids) # Compute distances from candidates to all points candidates_X = X[candidate_ids] dists_candidates = pairwise_distances(candidates_X, X, metric=metric) ** 2 best_candidate = None best_pot = None best_dist_sq = None for i in range(n_local_trials): new_dist_sq = np.minimum(closest_dist_sq, dists_candidates[i]) new_pot = new_dist_sq.sum() if best_candidate is None or new_pot < best_pot: best_candidate = candidate_ids[i] best_pot = new_pot best_dist_sq = new_dist_sq medoid_indices[c] = best_candidate current_pot = best_pot closest_dist_sq = best_dist_sq return medoid_indices