Density-based clustering captures arbitrarily-shaped clusters and handles noise. DBSCAN is simpler, HDBSCAN builds a hierarchical density model and is more robust to variable densities. Evaluate with a mix of internal scores, external scores (when labels exist), stability/persistence, and domain-specific validation (visualization, cluster sizes, outliers).
DBSCAN
HDBSCAN
Preprocessing
Complexity
# pip install scikit-learn hdbscan umap-learn
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score, adjusted_rand_score, adjusted_mutual_info_score
import hdbscan
# X: (n_samples, n_features) numpy array
X_scaled = StandardScaler().fit_transform(X)

# k-distance curve: for each point, the distance to its k-th nearest
# neighbour, sorted ascending. The "elbow" of this curve is the usual
# heuristic for choosing DBSCAN's eps.
k = 5
knn = NearestNeighbors(n_neighbors=k).fit(X_scaled)
neighbor_dists, _ = knn.kneighbors(X_scaled)
k_distances = np.sort(neighbor_dists[:, -1])
# plot k_distances and pick eps at elbow

# DBSCAN with eps taken from the elbow above and min_samples = k
db = DBSCAN(eps=0.5, min_samples=k, metric='euclidean').fit(X_scaled)
labels_db = db.labels_  # noise points are labelled -1
# HDBSCAN: hierarchical density-based clustering (no eps to tune)
hdb = hdbscan.HDBSCAN(
    min_cluster_size=10,
    min_samples=5,
    metric='euclidean',
).fit(X_scaled)
labels_hdb = hdb.labels_                # -1 = noise
probabilities = hdb.probabilities_      # soft membership strength per point
outlier_scores = hdb.outlier_scores_    # per-point outlier score
persistence = hdb.cluster_persistence_  # per-cluster stability measure
Metrics (example)
# internal validity: silhouette on the non-noise points only.
# Guard: silhouette_score raises ValueError unless there are at least
# 2 distinct clusters and at least 2 samples after dropping noise —
# a common situation when DBSCAN's eps is badly chosen.
core_mask = labels_db != -1
if core_mask.sum() >= 2 and np.unique(labels_db[core_mask]).size >= 2:
    sil_db = silhouette_score(X_scaled[core_mask], labels_db[core_mask])
else:
    sil_db = float("nan")  # silhouette is undefined in this case
# external validity (only if ground-truth labels y_true exist);
# both scores are chance-adjusted, 1.0 = perfect agreement
ari_hdb = adjusted_rand_score(y_true, labels_hdb)
ami_hdb = adjusted_mutual_info_score(y_true, labels_hdb)