👓 Cluster Analysis
Plotting Cluster Distributions
# remove `!` if running the line in a terminal
!pip install -U RelevanceAI[notebook]==2.0.0
from relevanceai import Client
"""
You can sign up/login and find your credentials here: https://cloud.tryrelevance.com/sdk/api
Once you have signed up, click on the value under `Activation token` and paste it here
"""
client = Client()
from relevanceai.utils.datasets import get_titanic_dataset
# Fetch the Titanic example documents.
documents = get_titanic_dataset()

# Assign each document its position in the list as its `_id`.
for i, doc in enumerate(documents):
    doc["_id"] = i

# Insert the documents into the "titanic" dataset.
ds = client.Dataset("titanic")
ds.insert_documents(documents)
Clustering
from sklearn.cluster import KMeans
# Clustering configuration, defined once so later cells can reuse it.
VECTOR_FIELD = "value_vector_"
KMEAN_NUMBER_OF_CLUSTERS = 5
# The alias embeds the cluster count, e.g. "kmeans_5".
ALIAS = f"kmeans_{KMEAN_NUMBER_OF_CLUSTERS}"

model = KMeans(n_clusters=KMEAN_NUMBER_OF_CLUSTERS)
clusterer = client.ClusterOps(alias=ALIAS, model=model)
# Use the VECTOR_FIELD constant rather than repeating the literal field name.
clusterer.run(dataset_id="titanic", vector_fields=[VECTOR_FIELD])
Plot Basic Distributions
# Build the visualisation helper from the same vector field and alias that
# were used for clustering, so the two stay in sync if the cluster count
# changes.
viz_ops = client.ClusterVizOps(
    dataset_id="titanic",
    vector_fields=[VECTOR_FIELD],
    alias=ALIAS,
)
# Plot the distribution of the "Age" field with top_indices=3.
viz_ops.plot_distributions("Age", top_indices=3)



Plotting Custom Distributions - Variation
from scipy.stats import skew, variation
# Plot "Age" distributions using scipy.stats.variation (coefficient of
# variation) as the custom measure function.
viz_ops.plot_distributions(
    numeric_field="Age",
    dataset_id="titanic",
    measure_function=variation,
    top_indices=3,
)
0%| | 0/5 [00:00<?, ?it/s]



# Plot "Age" distributions using scipy.stats.skew as the custom measure
# function.
viz_ops.plot_distributions(
    numeric_field="Age",
    dataset_id="titanic",
    measure_function=skew,
    top_indices=2,
)
0%| | 0/5 [00:00<?, ?it/s]

