👓 Cluster Analysis

Plotting Cluster Distributions

# remove `!` if running the line in a terminal
!pip install -U RelevanceAI[notebook]==2.0.0
from relevanceai import Client

"""
You can sign up/login and find your credentials here: https://cloud.relevance.ai/sdk/api
Once you have signed up, click on the value under `Activation token` and paste it here
"""
# Create the SDK client used by every later step of this guide.
client = Client()
from relevanceai.utils.datasets import get_titanic_dataset

# Fetch the sample Titanic documents and tag each one with its position in
# the list as its `_id`.
documents = get_titanic_dataset()
for position, record in enumerate(documents):
    record["_id"] = position

# Insert the tagged documents into the "titanic" dataset.
ds = client.Dataset("titanic")
ds.insert_documents(documents)

Clustering

from sklearn.cluster import KMeans

# Clustering configuration, defined once so the calls below cannot drift
# away from these values.
VECTOR_FIELD = "value_vector_"
KMEAN_NUMBER_OF_CLUSTERS = 5
ALIAS = "kmeans_" + str(KMEAN_NUMBER_OF_CLUSTERS)

# Run KMeans over the vector field of the "titanic" dataset, registering the
# result under ALIAS.
model = KMeans(n_clusters=KMEAN_NUMBER_OF_CLUSTERS)
clusterer = client.ClusterOps(alias=ALIAS, model=model)
# Use the VECTOR_FIELD constant instead of repeating the literal field name.
clusterer.run(dataset_id="titanic", vector_fields=[VECTOR_FIELD])

Plot Basic Distributions

# Build the visualisation helper from the same VECTOR_FIELD and ALIAS used
# for clustering, so changing KMEAN_NUMBER_OF_CLUSTERS cannot leave this
# pointing at a stale alias.
viz_ops = client.ClusterVizOps(
    dataset_id="titanic",
    vector_fields=[VECTOR_FIELD],
    alias=ALIAS,
)
# Plot the "Age" distribution per cluster; the field is passed by keyword to
# match the later plot_distributions calls in this guide.
viz_ops.plot_distributions(numeric_field="Age", top_indices=3)
../_images/cluster_distribution_guide_10_0.png ../_images/cluster_distribution_guide_10_1.png ../_images/cluster_distribution_guide_10_2.png

Plotting Custom Distributions - Variation

from scipy.stats import skew, variation


# Plot "Age" distributions using scipy.stats.variation (the coefficient of
# variation) as the custom measure.
# NOTE(review): presumably top_indices=3 limits the plot to the three
# top-ranked clusters - confirm against the ClusterVizOps documentation.
viz_ops.plot_distributions(
    numeric_field="Age",
    dataset_id="titanic",
    measure_function=variation,
    top_indices=3,
)
0%|          | 0/5 [00:00<?, ?it/s]
../_images/cluster_distribution_guide_12_1.png ../_images/cluster_distribution_guide_12_2.png ../_images/cluster_distribution_guide_12_3.png
# Same plot as above, but using scipy.stats.skew as the measure and keeping
# two indices instead of three.
viz_ops.plot_distributions(
    numeric_field="Age",
    dataset_id="titanic",
    measure_function=skew,
    top_indices=2,
)
0%|          | 0/5 [00:00<?, ?it/s]
../_images/cluster_distribution_guide_13_1.png ../_images/cluster_distribution_guide_13_2.png