Batch Clustering#

class relevanceai.operations.cluster.partial.PartialClusterOps#
partial_fit_dataset(dataset_id, vector_fields, chunksize=100, filters=None)#

Fit the dataset incrementally, batch by batch, using partial documents.

Example

from relevanceai import Client
client = Client()
df = client.Dataset("sample_dataset")

from sklearn.cluster import MiniBatchKMeans
model = MiniBatchKMeans(n_clusters=2)
cluster_ops = client.ClusterOps(alias="minibatchkmeans_2", model=model)

cluster_ops.partial_fit_dataset(df, vector_fields=["documentation_vector_"])
partial_fit_documents(vector_fields, documents)#

Train the clustering algorithm on a batch of documents and store the resulting labels inside those documents.

Parameters
  • vector_fields (list) – The vector fields of the documents

  • documents (list) – List of documents to run clustering on

  • alias (str) – What the clusters can be called

  • cluster_field (str) – What the cluster fields should be called

  • return_only_clusters (bool) – If True, return only the clusters; otherwise, return the original documents

  • inplace (bool) – If True, the documents are edited in place; otherwise, a copy is made first

  • kwargs (dict) – Any other keyword arguments are passed directly to the clustering algorithm

Example

from relevanceai import Client
client = Client()
df = client.Dataset("sample_dataset")

from sklearn.cluster import MiniBatchKMeans
model = MiniBatchKMeans(n_clusters=2)
cluster_ops = client.ClusterOps(alias="batchkmeans_2", model=model)

cluster_ops.partial_fit_dataset(df, vector_fields=["documentation_vector_"])
cluster_ops.predict_update(df, vector_fields=["sample_vector_"])
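
The example above drives fitting through the dataset object; to call partial_fit_documents directly on an in-memory batch, a minimal sketch might look like this (the sample documents and vector values below are illustrative assumptions, not part of the API):

# Hedged sketch: train on one in-memory batch of documents.
# These documents are made up for illustration; any documents
# containing the named vector field will work.
documents = [
    {"_id": "1", "documentation_vector_": [0.1, 0.2, 0.3]},
    {"_id": "2", "documentation_vector_": [0.9, 0.8, 0.7]},
]
cluster_ops.partial_fit_documents(
    vector_fields=["documentation_vector_"],
    documents=documents
)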
partial_fit_predict_update(dataset_id, vector_fields=None, chunksize=100, filters=None, verbose=True)#

Fit, predict, and update on a dataset in a single call. Users can also run these steps separately, one by one (see the sketch after the example below).

Parameters
  • dataset_id (Union[str, Dataset]) – The dataset, or dataset ID, to run on

  • vector_fields (List[str]) – The list of vector fields

  • chunksize (int) – The size of the chunks

Example

# Real-life example from Research Dashboard
from relevanceai import Client
client = Client()
df = client.Dataset("research2vec")

from sklearn.cluster import MiniBatchKMeans
model = MiniBatchKMeans(n_clusters=50)
cluster_ops = client.ClusterOps(alias="minibatchkmeans_50", model=model)

cluster_ops.partial_fit_predict_update(
    df,
    vector_fields=['title_trainedresearchqgen_vector_'],
    chunksize=1000
)
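
As noted above, the same pipeline can be run step by step. A minimal sketch, assuming the partial_fit_dataset and predict_update methods shown earlier on this class:

# Equivalent to partial_fit_predict_update, run as separate steps:
# 1. Incrementally fit the model on the dataset, chunk by chunk
cluster_ops.partial_fit_dataset(
    df,
    vector_fields=['title_trainedresearchqgen_vector_'],
    chunksize=1000
)

# 2. Predict cluster labels and write them back to the dataset
cluster_ops.predict_update(
    df,
    vector_fields=['title_trainedresearchqgen_vector_']
)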
class relevanceai.operations.cluster.sub.SubClusterOps#
fit_predict(dataset, vector_fields, parent_field=None, filters=None, verbose=False, min_parent_cluster_size=None, cluster_ids=None)#

Run subclustering on your dataset using an in-memory clustering algorithm.

Parameters
  • dataset (Dataset) – The dataset to run subclustering on

  • vector_fields (List) – The list of vector fields to run fitting, prediction and updating on

  • filters (Optional[List]) – The list of filters to apply when clustering

  • verbose (bool) – If True, print verbose output

Example

from relevanceai import Client
client = Client()

from relevanceai.package_utils.datasets import mock_documents
ds = client.Dataset("sample")

# Creates 100 sample documents
documents = mock_documents(100)
ds.upsert_documents(documents)

from sklearn.cluster import KMeans
model = KMeans(n_clusters=10)
clusterer = client.ClusterOps(alias="kmeans-10", model=model)
clusterer.fit_predict(
    dataset=ds,
    vector_fields=["sample_1_vector_"]
)
list_unique(field=None, minimum_amount=3, dataset_id=None, num_clusters=1000)#

List unique cluster IDs.

Parameters
  • field (str) – The cluster field to list unique values from

  • minimum_amount (int) – The minimum number of documents a cluster must have to be included

  • dataset_id (str) – The dataset ID

  • num_clusters (int) – The number of clusters

Example

from relevanceai import Client
client = Client()
cluster_ops = client.ClusterOps(
    alias="kmeans_8", vector_fields=["sample_vector_"]
)
cluster_ops.list_unique()

store_subcluster_metadata(parent_field, cluster_field)#

Store metadata about a subclustering run, recording the parent cluster field and the subcluster field.
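
A minimal sketch, assuming parent_field and cluster_field take fully qualified cluster field names of the form _cluster_.<vector_field>.<alias> (the exact field names below are illustrative assumptions, not fixed API values):

# Hedged sketch: the field names here are assumptions based on the
# aliases used in the surrounding examples.
cluster_ops.store_subcluster_metadata(
    parent_field="_cluster_.sample_1_vector_.kmeans-3",
    cluster_field="_cluster_.sample_1_vector_.subclusteringkmeans"
)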

subcluster_predict_documents(vector_fields=None, filters=None, min_parent_cluster_size=None, cluster_ids=None, verbose=True)#

Subclustering using fit-predict-update. This loops through each of the existing clusters and runs subclustering within them. For this, you need to have already run a parent clustering on the dataset and pass its alias as parent_alias.

Example

from relevanceai import Client
client = Client()
ds = client.Dataset("sample")

# Creating 100 sample documents
from relevanceai.package_utils.datasets import mock_documents
documents = mock_documents(100)
ds.upsert_documents(documents)

# Run simple clustering first
ds.auto_cluster("kmeans-3", vector_fields=["sample_1_vector_"])

# Start KMeans
from sklearn.cluster import KMeans
model = KMeans(n_clusters=20)

# Run subclustering.
cluster_ops = client.ClusterOps(
    alias="subclusteringkmeans",
    model=model,
    parent_alias="kmeans-3")

cluster_ops.subcluster_predict_documents(
    vector_fields=["sample_1_vector_"]
)
subpartialfit_predict_update(dataset, vector_fields, filters=None, cluster_ids=None, verbose=True)#

Run partial fit subclustering on your dataset.

Parameters
  • dataset (Dataset) – The dataset to call fit predict update on

  • vector_fields (list) – The list of vector fields

  • filters (list) – The list of filters

Example

from relevanceai import Client
client = Client()

from relevanceai.package_utils.datasets import mock_documents
ds = client.Dataset("sample")
# Creates 100 sample documents
documents = mock_documents(100)
ds.upsert_documents(documents)

from sklearn.cluster import MiniBatchKMeans
model = MiniBatchKMeans(n_clusters=10)
clusterer = client.ClusterOps(alias="minibatchkmeans-10", model=model)
clusterer.subpartialfit_predict_update(
    dataset=ds,
    vector_fields=["sample_1_vector_"]
)
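
Once the update finishes, the assigned labels live inside the documents. As a quick sanity check (the cluster field naming here is an assumption based on the alias and vector field above), the new cluster field should appear in the dataset schema:

# Inspect the dataset schema; a field like
# _cluster_.sample_1_vector_.minibatchkmeans-10 is expected to appear
# (the exact name is an assumption).
print(ds.schema)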