relevanceai.operations_new.dataset_ops#
RelevanceAI Operations wrappers for use from a Dataset object
from relevanceai import Client
client = Client()
dataset = client.Dataset()
dataset.vectorize_text(*args, **kwargs)
dataset.reduce_dims(*args, **kwargs)
dataset.cluster(*args, **kwargs)
Module Contents#
- relevanceai.operations_new.dataset_ops.get_ptp_args() → List[str]#
Returns the names of all arguments accepted by the PullTransformPush.__init__ function
- class relevanceai.operations_new.dataset_ops.Operations(*args, **kw)#
A Pandas-like dataset API for interacting with the RelevanceAI Python package
- reduce_dims(self, vector_fields: List[str], n_components: int = 3, batched: bool = False, model: Optional[Any] = None, model_kwargs: Optional[dict] = None, alias: Optional[str] = None, filters: Optional[list] = None, chunksize: Optional[int] = 100, output_field: str = None, **kwargs)#
Runs the DimReductionOps class on the documents in the dataset, reducing the given vector fields to n_components dimensions.
- Parameters
vector_fields (List[str]) – The vector fields to reduce.
model (Optional[Any]) – The dimensionality-reduction model to use.
filters (Optional[List[Dict[str, Any]]]) – A list of dictionaries, each dictionary containing a filter; only matching documents are processed.
chunksize (int, optional) – The number of documents to process at a time.
- Return type
None
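A minimal usage sketch; the dataset id "sample", the "sample_vector_" field and the alias are assumptions for illustration, and no explicit model is passed (relying on whatever default the method selects):
from relevanceai import Client

client = Client()
ds = client.Dataset("sample")  # hypothetical dataset id
# Reduce the assumed "sample_vector_" field down to 3 dimensions
ds.reduce_dims(
    vector_fields=["sample_vector_"],
    n_components=3,
    alias="reduced-3",
)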
- vectorize_text(self, fields: List[str], batched: bool = True, models: Optional[List[Any]] = None, filters: Optional[list] = None, chunksize: Optional[int] = None, output_fields: list = None, **kwargs)#
Vectorizes the given text fields with the supplied models by running the VectorizeOps function on the documents in the dataset.
- Parameters
fields (List[str]) – The text fields to vectorize.
models (Optional[List[Any]]) – The models to encode the fields with.
filters (Optional[List[Dict[str, Any]]]) – A list of dictionaries, each dictionary containing a filter; only matching documents are processed.
chunksize (int, optional) – The number of documents to process at a time.
- Return type
None
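A minimal usage sketch; the dataset id and the "description" field are assumptions, and the model identifier simply mirrors the sentence-transformers name used as a default elsewhere in this module:
from relevanceai import Client

client = Client()
ds = client.Dataset("sample")  # hypothetical dataset id
# Encode the assumed "description" text field into vectors
ds.vectorize_text(
    fields=["description"],
    models=["all-mpnet-base-v2"],
)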
- vectorize_image(self, fields: List[str], models: Optional[List[Any]] = None, batched: bool = True, filters: Optional[list] = None, chunksize: Optional[int] = 20, **kwargs)#
Vectorizes the given image fields with the supplied models by running the VectorizeOps function on the documents in the dataset.
- Parameters
fields (List[str]) – The image fields to vectorize.
models (Optional[List[Any]]) – The models to encode the fields with.
filters (Optional[List[Dict[str, Any]]]) – A list of dictionaries, each dictionary containing a filter; only matching documents are processed.
chunksize (int, optional) – The number of documents to process at a time.
- Return type
None
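A minimal usage sketch; the dataset id, the "image_url" field and the "clip" model identifier are all assumptions for illustration:
from relevanceai import Client

client = Client()
ds = client.Dataset("sample")  # hypothetical dataset id
# Encode the assumed "image_url" field with an assumed CLIP model identifier
ds.vectorize_image(
    fields=["image_url"],
    models=["clip"],
)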
- label(self, vector_fields: List[str], label_documents: List[Any], expanded: bool = True, max_number_of_labels: int = 1, similarity_metric: str = 'cosine', similarity_threshold: float = 0, label_field: str = 'label', label_vector_field: str = 'label_vector_', batched: bool = True, filters: Optional[list] = None, chunksize: Optional[int] = 100, output_field: str = None, **kwargs)#
Labels the documents in the dataset against a list of label documents, matching on the given vector fields.
- Parameters
vector_fields (List[str]) – The vector fields to match on.
label_documents (List[Any]) – The documents to label against.
expanded (bool, optional) – If True, the label_vector_field will be a list of vectors. If False, the label_vector_field will be a single vector.
max_number_of_labels (int, optional) – The maximum number of labels to assign to each document.
similarity_metric (str, optional) – The similarity metric to use, e.g. "cosine".
filters (Optional[list]) – A list of filters to apply to the documents.
chunksize (int, optional) – The number of documents to process at a time.
similarity_threshold (float, optional) – The minimum similarity required for a label to be assigned.
label_field (str, optional) – The name of the field that will contain the label.
label_vector_field (str, optional) – The field that will be added to the documents that contain the label vector.
- Return type
A list of documents.
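A minimal usage sketch; the dataset id, the vector field and the label-document schema (a "label" value plus a "label_vector_" vector, mirroring the default label_field and label_vector_field) are assumptions:
from relevanceai import Client

client = Client()
ds = client.Dataset("sample")  # hypothetical dataset id
# Hypothetical label documents; the schema mirrors the default
# label_field ("label") and label_vector_field ("label_vector_")
label_documents = [
    {"label": "refund", "label_vector_": [0.1, 0.2, 0.3]},
    {"label": "delivery", "label_vector_": [0.3, 0.1, 0.5]},
]
ds.label(
    vector_fields=["sample_vector_"],
    label_documents=label_documents,
)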
- label_from_dataset(self, vector_fields: list, label_dataset, max_number_of_labels: int = 1, label_vector_field='label_vector_', expanded: bool = False, similarity_metric: str = 'cosine', label_field: str = 'label', batched: bool = True, filters: list = None, similarity_threshold=0.1, chunksize: int = 100, output_field: str = None, **kwargs)#
Label from another dataset
- split_sentences(self, text_fields: List[str], output_field='_splittextchunk_', language: str = 'en', inplace: bool = True, batched: bool = False, filters: Optional[list] = None, chunksize: Optional[int] = 100, **kwargs)#
Splits the text in the given text fields into sentences and stores the sentences in the output_field.
- Parameters
text_fields (List[str]) – The fields in the documents that contain the text to be split into sentences.
output_field (str, optional) – The name of the field that will contain the split sentences.
language (str, optional) – The language of the text. This is used to determine the sentence splitting rules.
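A minimal usage sketch; the dataset id and the "description" field are assumptions, and the split sentences land in the default output field:
from relevanceai import Client

client = Client()
ds = client.Dataset("sample")  # hypothetical dataset id
# Split the assumed "description" field into sentences
ds.split_sentences(
    text_fields=["description"],
    language="en",
)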
- cluster(self, vector_fields: List[str], model: Optional[Any] = None, alias: Optional[str] = None, model_kwargs: Optional[Dict[str, Any]] = None, chunksize: Optional[int] = 128, filters: Optional[list] = None, batched: bool = False, include_cluster_report: bool = False, **kwargs)#
Clusters the documents on the given vector fields and returns a ClusterOps object.
Example
from sklearn.cluster import KMeans

from relevanceai import Client

client = Client()
ds = client.Dataset("sample")
model = KMeans()
cluster_ops = ds.cluster(
    model=model,
    vector_fields=["sample_vector_"],
    alias="kmeans-8",
)
- Parameters
vector_fields (List[str]) – A list of possible vector fields
model (Optional[Any]) – The clustering model to use. Currently, we support KMeans and MiniBatchKMeans.
alias (Optional[str]) – The name of the cluster model.
filters (Optional[list]) – A list of filters to apply to the documents.
include_cluster_report (bool, optional) – Whether to include a cluster report.
model_kwargs (Optional[Dict[str, Any]]) – The cluster config to use. You can change the number of clusters for kmeans using model_kwargs={"n_clusters": 10}. For a full list of possible parameters for different models, simply check how the cluster models are instantiated.
- Return type
A ClusterOps object.
- batch_cluster(self, vector_fields: List[str], model: Any = None, alias: Optional[str] = None, filters: Optional[list] = None, model_kwargs: Dict = None, chunksize: int = 128, **kwargs)#
- extract_sentiment(self, text_fields: List[str], model_name: str = 'cardiffnlp/twitter-roberta-base-sentiment', highlight: bool = False, max_number_of_shap_documents: int = 1, min_abs_score: float = 0.1, sensitivity: float = 0, filters: Optional[list] = None, output_fields: list = None, chunksize: int = 128, batched: bool = True, **kwargs)#
Extract sentiment from the dataset
If you are dealing with news sources, you will want a higher sensitivity, as news text tends to be neutral
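A minimal usage sketch using the default sentiment model; the dataset id and the "review_body" field are assumptions:
from relevanceai import Client

client = Client()
ds = client.Dataset("sample")  # hypothetical dataset id
# Score sentiment on the assumed "review_body" field with the default model
ds.extract_sentiment(
    text_fields=["review_body"],
)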
- extract_emotion(self, text_fields: list, model_name='joeddav/distilbert-base-uncased-go-emotions-student', filters: list = None, chunksize: int = 100, output_fields: list = None, min_score: float = 0.3, batched: bool = True, refresh: bool = False, **kwargs)#
Extract an emotion.
from relevanceai import Client

client = Client()
ds = client.Dataset("sample")
ds.extract_emotion(
    text_fields=["sample_1_label"],
)
- apply_transformers_pipeline(self, text_fields: list, pipeline, output_fields: Optional[List[str]] = None, filters: Optional[list] = None, refresh: bool = False, **kwargs)#
Apply a transformers pipeline generically.
from transformers import pipeline

pipeline = pipeline(
    "automatic-speech-recognition",
    model="facebook/wav2vec2-base-960h",
    device=0,
)
ds.apply_transformers_pipeline(
    text_fields, pipeline
)
- scale(self, vector_fields: List[str], model: Optional[str] = 'standard', alias: Optional[str] = None, model_kwargs: Optional[dict] = None, filters: Optional[list] = None, batched: Optional[bool] = None, chunksize: Optional[int] = None, **kwargs)#
- subcluster(self, vector_fields: List[str], alias: str, parent_field: str, model: Any = 'kmeans', cluster_field: str = '_cluster_', model_kwargs: Optional[dict] = None, filters: Optional[list] = None, cluster_ids: Optional[list] = None, min_parent_cluster_size: int = 0, **kwargs)#
- byo_cluster(self, vector_fields: list, alias: str, byo_cluster_field: str, centroids: list = None)#
Bring your own clusters and we can calculate the centroids for you.
Example
dataset = client.Dataset("retail_reviews")
cluster_ops = dataset.byo_cluster(
    vector_fields=['reviews.title_mpnet_vector_'],
    alias="manufacturer_two",
    byo_cluster_field="manufacturer",
)
- clean_text(self, text_fields: list, output_fields: list = None, remove_html_tags: bool = True, lower=False, remove_punctuation=True, remove_digits=True, remove_stopwords: list = None, lemmatize: bool = False, filters: list = None, replace_words: dict = None, **kwargs)#
Cleans text for you!
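A minimal usage sketch relying on the default cleaning options; the dataset id and the "review_body" field are assumptions:
from relevanceai import Client

client = Client()
ds = client.Dataset("sample")  # hypothetical dataset id
# With the defaults this strips HTML tags, punctuation and digits
ds.clean_text(
    text_fields=["review_body"],
)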
- count_text(self, text_fields: list, count_words: bool = True, count_characters: bool = True, count_sentences: bool = True, filters: list = None, chunksize: int = 1000, refresh: bool = False, **kwargs)#
- analyze_text(self, fields: list, vector_fields: list = None, vectorize=True, vectorize_models: list = None, cluster: bool = True, cluster_model=None, cluster_alias: str = None, subcluster: bool = True, subcluster_model=None, subcluster_alias: str = None, subcluster_parent_field: str = None, extract_sentiment: bool = True, extract_emotion: bool = False, count: bool = True, verbose: bool = False, filters: list = None)#
- analyze_vectors(self, vector_fields: list = None, cluster: bool = False, cluster_model=None, cluster_alias: str = None, subcluster: bool = False, subcluster_alias: str = None, subcluster_parent_field: str = None, subcluster_model=None, filters: list = None)#
- extract_keywords(self, fields: list, model_name: str = 'all-mpnet-base-v2', output_fields: list = None, lower_bound: int = 0, upper_bound: int = 3, chunksize: int = 200, max_keywords: int = 1, stop_words: list = None, filters: list = None, batched: bool = True, use_maxsum: bool = False, nr_candidates: int = 20, use_mmr=True, diversity=0.7, **kwargs)#
Extract the keyphrases of a text field and store them in a separate field. This can be used to better explain sentiment and labels, and to identify why certain things were clustered together!
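A minimal usage sketch using the default keyword model; the dataset id and the "review_body" field are assumptions:
from relevanceai import Client

client = Client()
ds = client.Dataset("sample")  # hypothetical dataset id
# Pull up to 3 keyphrases per document from the assumed "review_body" field
ds.extract_keywords(
    fields=["review_body"],
    max_keywords=3,
)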
- deduplicate(self, fields, amount_to_deduplicate: int = 100, filters: list = None)#
You can deduplicate values in your dataset here.
from relevanceai import Client

client = Client()
ds = client.Dataset("sample")  # dataset id assumed for this example
ds.deduplicate("text_field")
- extract_nouns(self, fields: list, output_fields: list, model_name: str = 'flair/chunk-english', cutoff_probability: float = 0.7, stopwords: list = None, filters: list = None, refresh: bool = False, chunksize: int = 50, **kwargs)#
Extract nouns to build a taxonomy
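A minimal usage sketch using the default chunking model; the dataset id, the input field and the output field name are assumptions:
from relevanceai import Client

client = Client()
ds = client.Dataset("sample")  # hypothetical dataset id
# Extract noun chunks from the assumed "review_body" field
# into a hypothetical output field
ds.extract_nouns(
    fields=["review_body"],
    output_fields=["review_body_nouns_"],
)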
- view_workflow_history(self)#
View all previous workflows
from relevanceai import Client

client = Client()
ds = client.Dataset('sample')
ds.view_workflow_history()
- translate(self, fields: list, model_id: str = None, output_fields: list = None, chunksize: int = 20, filters: list = None, refresh: bool = False)#
- extract_ner(self, fields: list, model_id: str = None, output_fields: list = None, chunksize: int = 20, filters: list = None, refresh: bool = False)#
Extract named entities (NER)
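A minimal usage sketch assuming the default model_id is acceptable; the dataset id and the "review_body" field are assumptions:
from relevanceai import Client

client = Client()
ds = client.Dataset("sample")  # hypothetical dataset id
# Extract named entities from the assumed "review_body" field
ds.extract_ner(
    fields=["review_body"],
)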
- tag_text(self, fields: list, model_id: str = None, labels: list = None, output_fields: list = None, chunksize: int = 20, minimum_score: float = 0.2, maximum_number_of_labels: int = 5, filters: list = None, refresh: bool = False, **kwargs)#
Tag text fields against a list of candidate labels
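A minimal usage sketch assuming the default model_id is acceptable; the dataset id, the field name and the candidate labels are assumptions:
from relevanceai import Client

client = Client()
ds = client.Dataset("sample")  # hypothetical dataset id
# Tag the assumed "review_body" field against hypothetical candidate labels
ds.tag_text(
    fields=["review_body"],
    labels=["pricing", "delivery", "quality"],
)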