Examples
This page contains runnable examples that demonstrate different ways to use scikit-clarans and how to integrate it with popular Python tooling.
Quickstart
A minimal quickstart example:
"""
01_quick_start.py
=================
A compact, runnable example showing a simple CLARANS workflow:
- Generate 2D blob data
- Fit CLARANS
- Print medoid indices and labels
- Plot the resulting clusters and medoids
Run this script with: python examples/01_quick_start.py
"""
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from clarans import CLARANS
def main():
    """Cluster synthetic 2D blobs with CLARANS, print the result and plot it.

    Prints the medoid indices and the first ten labels, then shows a scatter
    plot of the points coloured by cluster with the medoids drawn as stars.
    """
    # The ground-truth blob labels are discarded; only the points are clustered.
    points, _ = make_blobs(n_samples=500, centers=4, n_features=2, random_state=42)

    clusterer = CLARANS(n_clusters=4, numlocal=3, init="k-medoids++", random_state=42)
    clusterer.fit(points)

    print("Medoid Indices:", clusterer.medoid_indices_)
    assignments = clusterer.labels_
    print("First 10 Labels:", assignments[:10])

    # Visualization
    plt.figure(figsize=(8, 6))
    xs, ys = points[:, 0], points[:, 1]
    plt.scatter(xs, ys, c=assignments, cmap="tab10", s=20, alpha=0.7)

    # Overlay the medoids on top of the coloured points.
    medoids = clusterer.cluster_centers_
    medoid_style = dict(c="black", marker="*", s=200, label="Medoids")
    plt.scatter(medoids[:, 0], medoids[:, 1], **medoid_style)

    plt.title("CLARANS quick start: clusters and medoids")
    plt.legend()
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    main()
Compare initializations
Use this script to compare initialization strategies and runtimes:
"""
02_compare_initializations.py
=============================
Compare different initialization strategies available in CLARANS and report
final clustering cost for each method.
Run with: python examples/02_compare_initializations.py
"""
import time
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from clarans import CLARANS
from clarans.utils import calculate_cost
def main():
    """Fit CLARANS once per initialization strategy and compare the outcomes.

    For each strategy the final clustering cost (from ``calculate_cost``) and
    the elapsed time are printed, then both are shown as side-by-side bar
    charts.
    """
    X, _ = make_blobs(n_samples=800, centers=4, n_features=2, random_state=1)
    init_methods = ["random", "k-medoids++", "heuristic", "build"]

    results = []
    for init in init_methods:
        # perf_counter() is a monotonic, high-resolution clock intended for
        # measuring short durations; time.time() follows the wall clock and
        # can jump if the system time is adjusted.
        start = time.perf_counter()
        model = CLARANS(n_clusters=4, numlocal=3, init=init, random_state=42)
        model.fit(X)
        elapsed = time.perf_counter() - start

        # The cost is evaluated after the timer stops so it is not counted
        # as part of the runtime.
        cost = calculate_cost(X, model.medoid_indices_, metric=model.metric)
        results.append((init, cost, elapsed))
        print(f"init={init:12s} cost={cost:.2f} time={elapsed:.3f}s")

    # Simple bar plots
    labels, costs, times = zip(*results)
    plt.figure(figsize=(10, 4))

    plt.subplot(1, 2, 1)
    plt.bar(labels, costs, color="C1")
    plt.title("Final clustering cost by init method")
    plt.ylabel("Cost")

    plt.subplot(1, 2, 2)
    plt.bar(labels, times, color="C2")
    plt.title("Runtime by init method")
    plt.ylabel("Time (s)")

    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    main()
Different distance metrics
See how different metrics affect the clustering result:
"""
03_metrics_demo.py
==================
Demonstrate how different distance metrics affect CLARANS clustering.
Run with: python examples/03_metrics_demo.py
"""
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from clarans import CLARANS
from clarans.utils import calculate_cost
def main():
    """Cluster the same blobs under several distance metrics and plot each result.

    One CLARANS model is fitted per metric. Its cost is printed, and the
    resulting clusters and medoids are drawn in one subplot per metric.
    """
    X, _ = make_blobs(n_samples=500, centers=4, n_features=2, random_state=0)
    metrics = ["euclidean", "manhattan", "cosine"]

    models = []
    for metric in metrics:
        model = CLARANS(
            n_clusters=4, numlocal=3, init="k-medoids++", metric=metric, random_state=42
        )
        model.fit(X)
        # Each cost is evaluated with the metric the model was fitted with, so
        # the printed values are on different scales.
        cost = calculate_cost(X, model.medoid_indices_, metric=metric)
        models.append(model)
        print(f"metric={metric:9s} cost={cost:.2f}")

    # Plot clustering results for each metric.
    # squeeze=False always returns a 2D array of axes, so the loop below keeps
    # working if ``metrics`` is edited down to a single entry.
    fig, axes = plt.subplots(1, len(metrics), figsize=(15, 4), squeeze=False)
    for ax, metric, model in zip(axes[0], metrics, models):
        ax.scatter(X[:, 0], X[:, 1], c=model.labels_, cmap="tab10", s=20)
        ax.scatter(
            model.cluster_centers_[:, 0],
            model.cluster_centers_[:, 1],
            c="black",
            marker="*",
            s=150,
        )
        ax.set_title(metric)

    fig.suptitle("Effect of distance metric on CLARANS clustering")
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    main()
Using sparse inputs
"""
04_sparse_input.py
==================
Show that CLARANS accepts sparse CSR matrices as input.
Run with: python examples/04_sparse_input.py
"""
import numpy as np
from scipy import sparse
from sklearn.datasets import make_blobs
from clarans import CLARANS
def main():
    """Fit CLARANS on a SciPy CSR matrix and print the medoids it found."""
    dense, _ = make_blobs(n_samples=400, centers=3, n_features=6, random_state=0)

    # Make the matrix sparse by zeroing-out small values
    small_entries = np.abs(dense) < 1.0
    dense[small_entries] = 0
    X_sparse = sparse.csr_matrix(dense)

    clusterer = CLARANS(n_clusters=3, init="k-medoids++", random_state=0)
    clusterer.fit(X_sparse)

    print("Medoid indices:", clusterer.medoid_indices_)
    centers_shape = clusterer.cluster_centers_.shape
    print("Cluster centers (medoids); shape:", centers_shape)


if __name__ == "__main__":
    main()
Grid-search and pipelines
"""
05_pipeline_gridsearch.py
=========================
Demonstrates the CLARANS workflow with scikit-learn's GridSearchCV.
Run with: python examples/05_pipeline_gridsearch.py
"""
import warnings
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV
from clarans import CLARANS
# Synthetic dataset searched over below; the ground-truth blob labels are unused.
X, _ = make_blobs(
    n_samples=500,
    centers=4,
    n_features=10,
    random_state=42,
)
def clustering_silhouette_scorer(estimator, X):
    """Score a fitted clusterer on ``X`` by the silhouette of its predictions.

    Parameters
    ----------
    estimator : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Samples to assign to clusters and score.

    Returns
    -------
    float
        ``silhouette_score(X, labels)``, or ``-1.0`` when fewer than two
        distinct labels are predicted.
    """
    predicted = estimator.predict(X)
    distinct_labels = set(predicted)
    if len(distinct_labels) >= 2:
        return silhouette_score(X, predicted)
    # Degenerate assignment: fall back to a fixed score instead of calling
    # silhouette_score.
    return -1.0
# Hyper-parameter grid: 3 * 3 * 3 * 2 = 54 candidate configurations.
param_grid = {
    "n_clusters": [3, 4, 5],
    "numlocal": [2, 5, 10],
    "init": ["k-medoids++", "random", "heuristic"],
    "maxneighbor": [None, 50],
}

grid_search = GridSearchCV(
    estimator=CLARANS(random_state=42),
    param_grid=param_grid,
    scoring=clustering_silhouette_scorer,
    cv=3,
    verbose=1,
    n_jobs=-1,
)

print("Starting grid search (GridSearchCV)...")
# Warnings raised while fitting are suppressed to keep the console output short.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    grid_search.fit(X)

separator = "=" * 50
print("\n" + separator)
print("GRID SEARCH RESULTS")
print(separator)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best mean silhouette score: {grid_search.best_score_:.4f}")

best_model = grid_search.best_estimator_
print("\nBest model information:")
print(f" - Number of medoids found: {len(best_model.medoid_indices_)}")
print(f" - Medoid indices: {best_model.medoid_indices_}")

print("\nTop 3 best configurations:")
results = grid_search.cv_results_
# Order candidates by the rank GridSearchCV already computed instead of
# reverse-sorting the raw means: a NaN mean (a candidate whose fit failed)
# would otherwise sort to the front of the "top 3". The stable sort keeps
# tied candidates in grid order.
top_indices = np.argsort(results["rank_test_score"], kind="stable")[:3]
for i in top_indices:
    print(
        f" Rank {results['rank_test_score'][i]}: "
        f"Score={results['mean_test_score'][i]:.4f} | "
        f"Params={results['params'][i]}"
    )
Additional Resources
Additional examples are available in the examples directory.
The examples directory also includes an interactive Jupyter notebook with many examples: