relevanceai.dataset.read.read
#
All read operations for Dataset
Module Contents#
- relevanceai.dataset.read.read.update_nested_dictionary(d: dict, u: Union[dict, Mapping])#
“If the value is a dictionary, recursively call update_nested_dictionary on it, otherwise just set the value.”
The function takes two dictionaries as arguments, d and u. It iterates over the key-value pairs in u, and for each pair, it checks if the value is a dictionary. If it is, it calls update_nested_dictionary on the value and the corresponding value in d. If the value is not a dictionary, it just sets the value in d to the value in u
- Parameters
d (dict) – The dictionary to update
u (dict) – the dictionary whose key-value pairs are used to update d
- Return type
dict – the dictionary d with the updated values.
- class relevanceai.dataset.read.read.Read(credentials: relevanceai.client.helpers.Credentials, dataset_id: str, fields: Optional[list] = None, image_fields: Optional[List[str]] = None, audio_fields: Optional[List[str]] = None, text_fields: Optional[List[str]] = None, highlight_fields: Optional[Dict[str, list]] = None, **kwargs)#
A Pandas-like dataset API for interacting with the RelevanceAI python package
- property shape(self)#
Returns the shape (N x C) of a dataset N = number of samples in the Dataset C = number of columns in the Dataset
- Returns
(N, C)
- Return type
Tuple
Example
from relevanceai import Client client = Client() dataset_id = "sample_dataset_id" df = client.Dataset(dataset_id) length, width = df.shape
- info(self, dtype_count: bool = False) pandas.DataFrame #
Return a dictionary that contains information about the Dataset including the index dtype and columns and non-null values.
- Parameters
dtype_count (bool) – If dtype_count is True, prints a value_counts of the data type
- Returns
a pandas dataframe of information
- Return type
pd.DataFrame
Example
from relevanceai import Client client = Client() dataset_id = "sample_dataset_id" df = client.Dataset(dataset_id) df.info()
- head(self, n: int = 5, raw_json: bool = False, select_fields: list = None, **kw) Union[dict, pandas.DataFrame] #
Return the first n rows of your dataset. This is useful for quickly testing if your object has the right type of data in it.
- Parameters
n (int, default 5) – Number of rows to select.
raw_json (bool) – If True, returns raw JSON and not Pandas Dataframe
kw – Additional arguments to feed into show_json
- Returns
The first ‘n’ rows of the caller object.
- Return type
Pandas DataFrame or Dict, depending on args
Example
from relevanceai import Client client = Client() df = client.Dataset("sample_dataset_id", image_fields=["image_url"]) df.head()
- sample(self, n: int = 1, frac: float = None, filters: Optional[list] = None, random_state: int = 0, select_fields: Optional[list] = None, include_vector: bool = True, output_format: str = 'json')#
Return a random sample of items from a dataset.
- Parameters
n (int) – Number of items to return. Cannot be used with frac.
frac (float) – Fraction of items to return. Cannot be used with n.
filters (list) – Query for filtering the search results
random_state (int) – Random Seed for retrieving random documents.
select_fields (list) – Fields to include in the search results, empty array/list means all fields.
Example
from relevanceai import Client client = Client() df = client.Dataset("sample_dataset_id", image_fields=["image_url"]) df.sample()
- get_all_documents(self, chunksize: int = 1000, filters: Optional[List] = None, sort: Optional[List] = None, select_fields: Optional[List] = None, include_vector: bool = True, show_progress_bar: bool = True)#
Retrieve all documents with filters. Filter is used to retrieve documents that match the conditions set in a filter query. This is used in advance search to filter the documents that are searched. For more details see documents.get_where.
- Parameters
chunksize (int) – Number of documents to retrieve per retrieval
include_vector (bool) – Include vectors in the search results
sort (list) – Fields to sort by. For each field, sort by descending or ascending. If you are using descending by datetime, it will get the most recent ones.
filters (list) – Query for filtering the search results
select_fields (list) – Fields to include in the search results, empty array/list means all fields.
Example
from relevanceai import Client client = Client() dataset_id = "sample_dataset_id" df = client.Dataset(dataset_id) documents = df.get_all_documents()
- get_documents_by_ids(self, document_ids: Union[List, str], include_vector: bool = True)#
Retrieve a document by its ID (“_id” field). This will retrieve the document faster than a filter applied on the “_id” field.
- Parameters
document_ids (Union[list, str]) – ID of a document in a dataset.
include_vector (bool) – Include vectors in the search results
Example
from relevanceai import Client, Dataset client = Client() dataset_id = "sample_dataset_id" df = client.Dataset(dataset_id) df.get_documents_by_ids(["sample_id"], include_vector=False)
- get(self, document_ids: Union[List, str], include_vector: bool = True)#
Retrieve a document by its ID (“_id” field). This will retrieve the document faster than a filter applied on the “_id” field. This has the same functionality as get_documents_by_ids.
- Parameters
document_ids (Union[list, str]) – ID of a document in a dataset.
include_vector (bool) – Include vectors in the search results
Example
from relevanceai import Client client = Client() dataset_id = "sample_dataset_id" df = client.Dataset(dataset_id) df.get(["sample_id"], include_vector=False)
- property schema(self) Dict #
Returns the schema of a dataset. Refer to datasets.create for different field types available in a Relevance schema.
Example
from relevanceai import Client client = Client() dataset_id = "sample_dataset_id" df = client.Dataset(dataset_id) df.schema
- property columns(self) List[str] #
Returns a list of columns
Example
from relevanceai import Client client = Client() dataset_id = "sample_dataset_id" df = client.Dataset(dataset_id) df.columns
- filter(self, index: Union[str, None] = None, items: Union[List, None] = None, like: Union[str, None] = None, regex: Union[str, None] = None, axis: Union[int, str] = 0)#
Returns a subset of the dataset, filtered by the parameters given
- Parameters
index (str) – the column on which to filter; if None then defaults to the _id column
items – Keep labels from axis which are in items.
like (str) – Keep labels from axis for which “like in label == True”.
regex (str (regular expression)) – Keep labels from axis for which re.search(regex, label) == True.
axis ({0 or index, 1 or columns},) – The axis on which to perform the search
- Return type
list of documents
Example
from relevanceai import Client client = Client() df = client.Dataset("ecommerce-example-encoded") filtered = df.filter(items=["product_title", "query", "product_price"]) filtered = df.filter(index="query", like="routers") filtered = df.filter(index="product_title", regex=".*Hard.*Drive.*")
- get_documents(self, number_of_documents: int = 20, filters: Optional[list] = None, cursor: str = None, chunksize: int = 1000, sort: Optional[list] = None, select_fields: Optional[list] = None, include_vector: bool = True, include_cursor: bool = False, after_id: Optional[list] = None, include_after_id: bool = True)#
Retrieve documents with filters. Filter is used to retrieve documents that match the conditions set in a filter query. This is used in advance search to filter the documents that are searched.
If you are looking to combine your filters with multiple ORs, simply add the following inside the query {“strict”:”must_or”}. :param dataset_id: Unique name of dataset :type dataset_id: string :param number_of_documents: Number of documents to retrieve :type number_of_documents: int :param select_fields: Fields to include in the search results, empty array/list means all fields. :type select_fields: list :param cursor: Cursor to paginate the document retrieval :type cursor: string :param chunksize: Number of documents to retrieve per iteration :type chunksize: int :param include_vector: Include vectors in the search results :type include_vector: bool :param sort: Fields to sort by. For each field, sort by descending or ascending. If you are using descending by datetime, it will get the most recent ones. :type sort: list :param filters: Query for filtering the search results :type filters: list
- get_metadata(self)#
Store Metadata
- property metadata(self)#
Get the metadata
- insert_metadata(self, metadata: dict)#
Insert metadata
- upsert_metadata(self, metadata: dict)#
Upsert metadata.
- chunk_dataset(self, select_fields: List = None, chunksize: int = 100, filters: list = None, after_id: list = None)#
Function for chunking a dataset
Example
from relevanceai import Client client = Client() ds = client.Dataset("sample") for c in ds.chunk_dataset( select_fields=["sample_label"], chunksize=100 ): # Returns a dictionary with 'cursor' and 'documents' keys docs = c['documents'] cursor = c['cursor'] for d in docs: d.update({"value": 3}) ds.upsert_documents(docs)
- list_vector_fields(self)#
Returns list of valid vector fields in dataset :param dataset_id: Unique name of dataset :type dataset_id: string
Example
from relevanceai import Client client = Client() ds = client.Dataset("_mock_dataset_") ds.list_vector_fields()
- abstract list_cluster_aliases(self)#
- isnull(self, show_progress_bar=False)#
- facets(self, fields: list, date_interval: str = 'monthly', page_size: int = 5, page: int = 1, asc: bool = False)#
Get a summary of fields - such as most common, their min/max, etc.
Example
from relevanceai import Client client = Client() from relevanceai.datasets import mock_documents documents = mock_documents(100) ds = client.Dataset("mock_documents") ds.upsert_documents(documents) ds.facets(["sample_1_value"])
- get_settings(self)#
Get the settings in dataset
- list_cluster_fields(self)#
- get_after_ids_for_workflows(self, num_of_workers: int = 3) List[Tuple[int, List[Optional[str]]]] #
Get multiple after IDs to run workflows in parallel
- num_of_workers: int = 3
The number of workers that we need to separate out the After IDs
- update_field_children(self, field: str, field_children: list, category: str, metadata: dict = None)#
Update the field children.
- list_field_children(self)#
List field children
- delete_field_children(self, fieldchildren_id: str)#
Delete field children