Batch Clustering#
- class relevanceai.operations.cluster.partial.PartialClusterOps#
- partial_fit_dataset(dataset_id, vector_fields, chunksize=100, filters=None)#
Fit the dataset in batches of documents via partial fitting.
Example
from relevanceai import Client
client = Client()
df = client.Dataset("sample_dataset")

from sklearn.cluster import MiniBatchKMeans
model = MiniBatchKMeans(n_clusters=2)

cluster_ops = client.ClusterOps(alias="minibatchkmeans_2", model=model)
cluster_ops.partial_fit_dataset(df, vector_fields=["documentation_vector_"])
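The chunksize and filters arguments from the signature are not exercised above. A minimal sketch, continuing from the example, that restricts the fit to a filtered subset; the filter dictionary shape and the product_category field are assumptions to verify against your SDK version:
# Continuing from the example above; the filter shape and field name
# below are assumptions, not part of the original example.
filters = [
    {
        "field": "product_category",       # hypothetical field
        "filter_type": "exact_match",
        "condition": "==",
        "condition_value": "electronics",  # hypothetical value
    }
]
cluster_ops.partial_fit_dataset(
    df,
    vector_fields=["documentation_vector_"],
    chunksize=200,   # fit 200 documents per batch
    filters=filters,
)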
- partial_fit_documents(vector_fields, documents)#
Train the clustering algorithm on documents, then store the labels inside those documents.
- Parameters
vector_fields (list) – The vector fields of the documents
documents (list) – The list of documents to run clustering on
alias (str) – The alias under which the clusters are stored
cluster_field (str) – The name to use for the cluster fields
return_only_clusters (bool) – If True, return only the clusters; otherwise, return the original documents
inplace (bool) – If True, the documents are edited in place; otherwise, a copy is made first
kwargs (dict) – Any other keyword arguments are passed directly to the clustering algorithm
Example
from relevanceai import Client
client = Client()
df = client.Dataset("sample_dataset")

from sklearn.cluster import MiniBatchKMeans
model = MiniBatchKMeans(n_clusters=2)

cluster_ops = client.ClusterOps(alias="batchkmeans_2", model=model)
cluster_ops.partial_fit_dataset(df, vector_fields=["documentation_vector_"])
cluster_ops.predict_update(df, vector_fields=["sample_vector_"])
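Note that the example above never calls partial_fit_documents directly. A minimal sketch of a direct call, reusing the mock_documents helper from the subclustering examples below; the exact write-back behaviour of the returned documents is an assumption to verify:
from relevanceai import Client
from relevanceai.package_utils.datasets import mock_documents
from sklearn.cluster import MiniBatchKMeans

client = Client()

# 100 sample documents containing sample_N_vector_ fields
documents = mock_documents(100)

model = MiniBatchKMeans(n_clusters=2)
cluster_ops = client.ClusterOps(alias="batchkmeans_2", model=model)

# Train on the documents; labels are stored inside the documents
labelled_documents = cluster_ops.partial_fit_documents(
    vector_fields=["sample_1_vector_"],
    documents=documents,
)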
- partial_fit_predict_update(dataset_id, vector_fields=None, chunksize=100, filters=None, verbose=True)#
Fit, predict, and update on a dataset. Users can also run these steps separately, one by one.
- Parameters
dataset_id (Union[str, Dataset]) – The dataset, or its ID, to run on
vector_fields (List[str]) – The list of vector fields
chunksize (int) – The size of the chunks
Example
# Real-life example from the Research Dashboard
from relevanceai import Client
client = Client()
df = client.Dataset("research2vec")

from sklearn.cluster import MiniBatchKMeans
model = MiniBatchKMeans(n_clusters=50)

cluster_ops = client.ClusterOps(alias="minibatchkmeans_50", model=model)
cluster_ops.partial_fit_predict_update(
    df,
    vector_fields=["title_trainedresearchqgen_vector_"],
    chunksize=1000,
)
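Since the steps can also be run one by one, here is a sketch of the split workflow, continuing from the example above; partial_fit_dataset is documented earlier in this section, while predict_update is assumed to take the same arguments as in the partial_fit_documents example:
# Continuing from the example above: run fit and predict/update
# separately. predict_update usage is an assumption based on the
# partial_fit_documents example; verify against your SDK version.
cluster_ops.partial_fit_dataset(
    df,
    vector_fields=["title_trainedresearchqgen_vector_"],
    chunksize=1000,
)
cluster_ops.predict_update(
    df,
    vector_fields=["title_trainedresearchqgen_vector_"],
)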
- class relevanceai.operations.cluster.sub.SubClusterOps#
- fit_predict(dataset, vector_fields, parent_field=None, filters=None, verbose=False, min_parent_cluster_size=None, cluster_ids=None)#
Run subclustering on your dataset using an in-memory clustering algorithm.
- Parameters
dataset (Dataset) – The dataset to run subclustering on
vector_fields (List) – The list of vector fields to run fitting, prediction and updating on
filters (Optional[List]) – The list of filters to run clustering on
verbose (bool) – If True, print verbose progress output
Example
from relevanceai import Client
client = Client()

from relevanceai.package_utils.datasets import mock_documents
ds = client.Dataset("sample")
# Create 100 sample documents
documents = mock_documents(100)
ds.upsert_documents(documents)

from sklearn.cluster import KMeans
model = KMeans(n_clusters=10)

clusterer = client.ClusterOps(alias="kmeans-10", model=model)
clusterer.fit_predict(
    dataset=ds,
    vector_fields=["sample_1_vector_"],
)
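For subclustering proper, the parent_field argument tells fit_predict which existing labels to split. A sketch continuing from the example above; the _cluster_.<vector_field>.<alias> path is an assumption based on the naming convention used elsewhere in these docs and should be verified against your dataset schema:
# Continuing from the example above: split an existing kmeans-3
# clustering into subclusters. The parent field path below is an
# assumption, not taken from the original example.
clusterer.fit_predict(
    dataset=ds,
    vector_fields=["sample_1_vector_"],
    parent_field="_cluster_.sample_1_vector_.kmeans-3",
    min_parent_cluster_size=5,  # skip parent clusters smaller than this
)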
- list_unique(field=None, minimum_amount=3, dataset_id=None, num_clusters=1000)#
List unique cluster IDs.
- Parameters
field (str) – The field to list unique cluster IDs from (defaults to the cluster field)
minimum_amount (int) – The minimum number of documents a cluster must contain to be listed
dataset_id (str) – The dataset ID
num_clusters (int) – The number of clusters
Example
from relevanceai import Client
client = Client()

cluster_ops = client.ClusterOps(
    alias="kmeans_8",
    vector_fields=["sample_vector_"],
)
cluster_ops.list_unique()
- store_subcluster_metadata(parent_field, cluster_field)#
Store subcluster metadata, recording the parent_field and cluster_field used by the subclustering run.
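A minimal sketch of a direct call, reusing the cluster_ops object from the subclustering example below; both field paths follow the assumed _cluster_.<vector_field>.<alias> naming convention and should be verified against your dataset schema:
# Both field paths below are assumptions based on the
# _cluster_.<vector_field>.<alias> naming convention.
cluster_ops.store_subcluster_metadata(
    parent_field="_cluster_.sample_1_vector_.kmeans-3",
    cluster_field="_cluster_.sample_1_vector_.subclusteringkmeans",
)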
- subcluster_predict_documents(vector_fields=None, filters=None, min_parent_cluster_size=None, cluster_ids=None, verbose=True)#
Subclustering using fit, predict, and update. This loops through all of the different clusters and runs subclustering on each of them. For this, you need to have run a parent clustering first and to initialize ClusterOps with a parent_alias, as in the example below.
Example
from relevanceai import Client
client = Client()
ds = client.Dataset("sample")

# Create 100 sample documents
from relevanceai.package_utils.datasets import mock_documents
documents = mock_documents(100)
ds.upsert_documents(documents)

# Run simple clustering first
ds.auto_cluster("kmeans-3", vector_fields=["sample_1_vector_"])

# Set up KMeans for the subclusters
from sklearn.cluster import KMeans
model = KMeans(n_clusters=20)

# Run subclustering
cluster_ops = client.ClusterOps(
    alias="subclusteringkmeans",
    model=model,
    parent_alias="kmeans-3",
)
cluster_ops.subcluster_predict_documents(vector_fields=["sample_1_vector_"])
- subpartialfit_predict_update(dataset, vector_fields, filters=None, cluster_ids=None, verbose=True)#
Run partial fit subclustering on your dataset.
- Parameters
dataset (Dataset) – The dataset to run partial-fit subclustering on
vector_fields (list) – The list of vector fields
filters (list) – The list of filters
Example
from relevanceai import Client
client = Client()

from relevanceai.package_utils.datasets import mock_documents
ds = client.Dataset("sample")
# Create 100 sample documents
documents = mock_documents(100)
ds.upsert_documents(documents)

from sklearn.cluster import MiniBatchKMeans
model = MiniBatchKMeans(n_clusters=10)

clusterer = client.ClusterOps(alias="minibatchkmeans-10", model=model)
clusterer.subpartialfit_predict_update(
    dataset=ds,
    vector_fields=["sample_1_vector_"],
)