relevanceai.operations.labels.labels
#
Pandas like dataset API
Module Contents#
- class relevanceai.operations.labels.labels.LabelOps(*args, **kwargs)#
Base class for operations
- label_vector(self, vector, alias: str, label_dataset_id: str, label_vector_field: str, label_fields: list, number_of_labels: int = 1, similarity_metric: relevanceai.operations.cluster.constants.NEAREST_NEIGHBOURS = 'cosine', score_field: str = '_search_score', **kwargs)#
Label a dataset based on a model.
Warning
This function is currently in beta and is likely to change in the future. We recommend not using this in any production systems.
Note
New in v0.32.0
- Parameters
vector (list) – The vector to search with when labelling
label_dataset_id (str) – The dataset to label with
alias (str) – The alias of the labels (for example - “ranking_labels”)
label_dataset_id – The dataset to use for labelling
label_vector_field (str) – The vector field of the label dataset
label_fields (list) – The label field of the dataset to use
number_of_labels (int) – The number of labels to get
similarity_metric (str) – The similarity metric to adopt
score_field (str) – The field to use for scoring
Example
from relevanceai import Client from relevanceai.ops.clusterops.cluster import ClusterOps from relevanceai.ops.clusterops.kmeans_clusterer import KMeansModel client = Client() dataset_id = "sample_dataset_id" df = client.Dataset(dataset_id) result = df.label_vector( [...], label_vector_field="sample_1_vector_", alias="alias_sample", label_dataset_id=label_dataset_id_id, label_fields=["sample_1_label"], number_of_labels=1, )
- store_labels_in_document(self, labels: list, alias: str)#
- label_document(self, document: dict, vector_field: str, vector: List[float], alias: str, label_dataset_id: str, label_vector_field: str, label_fields: List[str], number_of_labels: int = 1, similarity_metric='cosine', score_field: str = '_search_score')#
Label a dataset based on a model.
Warning
This function is currently in beta and is likely to change in the future. We recommend not using this in any production systems.
Note
New in v0.32.0
- Parameters
document (dict) – A document to label
vector_field (str) – The vector field of the document
label_dataset_id (str) – The dataset to label with
alias (str) – The alias of the labels (for example - “ranking_labels”)
label_dataset_id – The dataset to use for labelling
label_vector_field (str) – The vector field of the label dataset
label_fields (list) – The label field of the dataset to use
number_of_labels (int) – The number of labels to get
similarity_metric (str) – The similarity metric to adopt
score_field (str) – The field to use for scoring
Example
from relevanceai import Client client = Client() df = client.Dataset("sample_dataset_id") results = df.label_document( document={...}, vector_field="sample_1_vector_", alias="example", label_dataset_id=label_dataset_id_id, label_fields=["sample_1_label"], label_vector_field="sample_1_vector_", filters=[ { "field": "sample_1_label", "filter_type": "exists", "condition": ">=", "condition_value": " ", }, ], )
- label_from_dataset(self, vector_field: str, alias: str, label_dataset_id: str, label_vector_field: str, label_fields: List[str], number_of_labels: int = 1, filters: Optional[list] = None, similarity_metric='cosine', score_field: str = '_search_score')#
Label a dataset based on a model.
Warning
This function is currently in beta and is likely to change in the future. We recommend not using this in any production systems.
Note
New in v0.32.0
- Parameters
vector_field (str) – The vector field to match with
alias (str) – The alias of the labels (for example - “ranking_labels”)
label_dataset_id (str) – The dataset to use for labelling
label_vector_field (str) – The vector field of the label dataset
label_fields (list) – The label field of the dataset to use
filters (list) – The filters to apply to label
number_of_labels (int) – The number of labels to get
similarity_metric (str) – The similarity metric to adopt
score_field (str) – The field to use for scoring
Example
from relevanceai import Client client = Client() df = client.Dataset("sample_dataset_id") results = df.label( vector_field="sample_1_vector_", alias="example", label_dataset_id=label_dataset_id, label_fields=["sample_1_label"], label_vector_field="sample_1_vector_", filters=[ { "field": "sample_1_label", "filter_type": "exists", "condition": ">=", "condition_value": " ", }, ], )
- label_from_list(self, vector_field: str, model: Callable, label_list: list, similarity_metric='cosine', number_of_labels: int = 1, score_field: str = '_search_score', alias: Optional[str] = None)#
Label from a given list.
- Parameters
vector_field (str) – The vector field to label in the original dataset
model (Callable) – This will take a list of strings and then encode them
label_list (List) – A list of labels to accept
similarity_metric (str) – The similarity metric to accept
number_of_labels (int) – The number of labels to accept
score_field (str) – What to call the scoring of the labels
alias (str) – The alias of the labels
Example
from relevanceai import Client client = Client() df = client.Dataset("sample") # Get a model to help us encode from vectorhub.encoders.text.tfhub import USE2Vec enc = USE2Vec() # Use that model to help with encoding label_list = ["dog", "cat"] df = client.Dataset("_github_repo_vectorai") df.label_from_list("documentation_vector_", enc.bulk_encode, label_list, alias="pets")
- clean_html(self, html)#
Cleans HTML from text
- get_word_count(self, text_fields: List[str])#
Create labels from a given text field.
- Parameters
text_fields (list) – List of text fields
Example
from relevanceai import Client client = Client() df = client.Dataset("sample") df.get_word_count()
- generate_text_list_from_documents(self, documents: Optional[list] = None, text_fields: Optional[list] = None, clean_html: bool = False)#
Generate a list of text from documents to feed into the counter model. :param documents: A list of documents :type documents: list :param text_fields: A list of text fields :type text_fields: list :param clean_html: If True, also cleans the text in a given text document to remove HTML. Will be slower
if processing on a large document
- generate_text_list(self, filters: Optional[list] = None, chunksize: int = 20, text_fields: Optional[list] = None, cursor: str = None)#
- get_ngrams(self, text, n: int = 2, stopwords_dict: str = 'english', additional_stopwords: Optional[list] = None, min_word_length: int = 2, preprocess_hooks: Optional[list] = None)#
- keyphrases(self, text_fields: list, algorithm: str = 'rake', n: int = 2, most_common: int = 10, filters: Optional[list] = None, additional_stopwords: Optional[list] = None, min_word_length: int = 2, chunksize: int = 1000, document_limit: int = None, preprocess_hooks: Optional[List[callable]] = None, verbose: bool = True) list #
Returns the most common phrase in the following format:
[('heavily draping faux fur', 16.0), ('printed sweatshirt fabric made', 14.333333333333334), ('high paper bag waist', 14.25), ('ribbed organic cotton jersey', 13.803030303030303), ('soft sweatshirt fabric', 9.0), ('open back pocket', 8.5), ('layered tulle skirt', 8.166666666666666), ('soft brushed inside', 8.0), ('discreet side pockets', 7.5), ('cotton blend', 5.363636363636363)]
- Parameters
text_fields (list) – A list of text fields
algorithm (str) – The algorithm to use. Must be one of nltk or rake.
n (int) – if algorithm is nltk, this will set the number of words. If rake, then it will do nothing.
most_common (int) – How many to return
filters (list) – A list of filters to supply
additional_stopwords (list) – A list of additional stopwords to supply
min_word_length (int) – The minimum word length to apply to clean. This can be helpful if there are common acronyms that you want to exclude.
chunksize (int) – Batch size is the number of documents to retrieve in a chunk
document_limit (int) – The maximum number of documents in a dataset
preprocess_hooks (List[Callable]) – A list of process hooks to clean text before they count as a word
Example
from relevanceai import Client client = Client() ds = client.Dataset("sample") # Returns the top keywords in a text field ds.keyphrases(text_fields=["sample"]) # Create an e-commerce dataset from relevanceai.package_utils.datasets import get_dummy_ecommerce_dataset docs = get_dummy_ecommerce_dataset() ds = client.Dataset("ecommerce-example") ds.upsert_documents(docs) ds.keyphrases(text_fields=text_fields, algorithm="nltk", n=3) def remove_apostrophe(string): return string.replace("'s", "") ds.keyphrases(text_fields=text_fields, algorithm="nltk", n=3, preprocess_hooks=[remove_apostrophe]) ds.keyphrases(text_fields=text_fields, algorithm="nltk", n=3, additional_stopwords=["Men", "Women"])
- cluster_keyphrases(self, text_fields: List[str], vector_fields: Optional[List[str]] = None, cluster_alias: Optional[str] = None, cluster_field: str = '_cluster_', num_clusters: int = 100, most_common: int = 10, preprocess_hooks: Optional[List[callable]] = None, algorithm: str = 'rake', n: int = 2, deployable_id: Optional[str] = None, dataset_id: Optional[str] = None)#
Simple implementation of the cluster keyphrases.
Example
from relevanceai import Client client = Client() ds = client.Dataset("sample") from relevanceai.operations.labels import LabelOps label_ops = LabelOps.from_dataset(ds) label_ops.cluster_keyphrases( text_fields=["sample_text"] )
- Parameters
vector_fields (list) – The list of vector fields
text_fields (list) – The list of text fields
cluster_alias (str) – The alias of the cluster
cluster_field (str) – The cluster field to try things on
num_clusters (int) – The number of clusters
preprocess_hooks (list) – The preprocess hooks
algorithm (str) – The algorithm to use
n (int) – The number of words
- label_from_common_words(self, text_field: str, model: Callable = None, most_common: int = 1000, n_gram: int = 1, temp_vector_field: str = '_label_vector_', labels_fn='labels.txt', stopwords: Optional[list] = None, algorithm: str = 'nltk')#
Label by the most popular keywords.
Algorithm:
Get top X keywords or bigram for a text field
Default X to 1000 or something scaled towards number of documents
Vectorize those into keywords
Label every document with those top keywords
Note
New in v1.1.0
- Parameters
text_field (str) – The field to label
model (Callable) – The function or callable to turn text into a vector.
most_common (int) – How many of the most common words do you want to use as labels
n_gram (int) – How many word co-occurrences do you want to consider
temp_vector_field (str) – The temporary vector field name
labels_fn (str) – The filename for labels to be saved in.
stopwords (list) – A list of stopwords
algorithm (str) – The algorithm to use. Must be one of nltk or rake.
Example
import random from relevanceai import Client from relevanceai.package_utils.datasets import mock_documents from relevanceai.package_utils.logger import FileLogger client = Client() ds = client.Dataset("sample") documents = mock_documents() ds.insert_documents(documents) def encode(): return [random.randint(0, 100) for _ in range(5)] ds.label_from_common_words( text_field="sample_1_label", model=encode, most_common=10, n_gram=1 )