relevanceai.dataset.read.read#

All read operations for Dataset

Module Contents#

relevanceai.dataset.read.read.update_nested_dictionary(d: dict, u: Union[dict, Mapping])#

“If the value is a dictionary, recursively call update_nested_dictionary on it; otherwise, set the value directly.”

The function takes two dictionaries as arguments, d and u. It iterates over the key-value pairs in u and, for each pair, checks whether the value is a dictionary. If it is, it calls update_nested_dictionary on that value and the corresponding value in d; if not, it sets the value in d to the value in u.

Parameters
  • d (dict) – The dictionary to update

  • u (dict) – The dictionary whose values are merged into d

Return type

A dictionary with the updated values.
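
Example

A minimal sketch of the merge behavior described above; the input dictionaries are illustrative only.

from relevanceai.dataset.read.read import update_nested_dictionary

d = {"a": {"b": 1, "c": 2}, "x": 0}
u = {"a": {"b": 10}, "y": 5}
merged = update_nested_dictionary(d, u)
# merged == {"a": {"b": 10, "c": 2}, "x": 0, "y": 5}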

class relevanceai.dataset.read.read.Read(credentials: relevanceai.client.helpers.Credentials, dataset_id: str, fields: Optional[list] = None, image_fields: Optional[List[str]] = None, audio_fields: Optional[List[str]] = None, text_fields: Optional[List[str]] = None, highlight_fields: Optional[Dict[str, list]] = None, **kwargs)#

A Pandas-like dataset API for interacting with the RelevanceAI Python package

property shape(self)#

Returns the shape (N x C) of a dataset, where N is the number of samples in the Dataset and C is the number of columns in the Dataset

Returns

(N, C)

Return type

Tuple

Example

from relevanceai import Client

client = Client()

dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)

length, width = df.shape
info(self, dtype_count: bool = False) pandas.DataFrame#

Return a DataFrame that contains information about the Dataset, including the index dtype, columns, and non-null values.

Parameters

dtype_count (bool) – If True, prints a value_counts of the data types

Returns

a pandas dataframe of information

Return type

pd.DataFrame

Example

from relevanceai import Client

client = Client()

dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
df.info()
head(self, n: int = 5, raw_json: bool = False, select_fields: list = None, **kw) Union[dict, pandas.DataFrame]#

Return the first n rows of your dataset. This is useful for quickly testing whether your object has the right type of data in it.

Parameters
  • n (int, default 5) – Number of rows to select.

  • raw_json (bool) – If True, returns raw JSON and not Pandas Dataframe

  • select_fields (list) – Fields to include in the search results, empty array/list means all fields.

  • kw – Additional arguments to feed into show_json

Returns

The first ‘n’ rows of the caller object.

Return type

Pandas DataFrame or Dict, depending on args

Example

from relevanceai import Client

client = Client()

df = client.Dataset("sample_dataset_id", image_fields=["image_url"])

df.head()
sample(self, n: int = 1, frac: float = None, filters: Optional[list] = None, random_state: int = 0, select_fields: Optional[list] = None, include_vector: bool = True, output_format: str = 'json')#

Return a random sample of items from a dataset.

Parameters
  • n (int) – Number of items to return. Cannot be used with frac.

  • frac (float) – Fraction of items to return. Cannot be used with n.

  • filters (list) – Query for filtering the search results

  • random_state (int) – Random Seed for retrieving random documents.

  • select_fields (list) – Fields to include in the search results, empty array/list means all fields.

  • include_vector (bool) – Include vectors in the search results

Example

from relevanceai import Client
client = Client()
df = client.Dataset("sample_dataset_id", image_fields=["image_url"])
df.sample()
get_all_documents(self, chunksize: int = 1000, filters: Optional[List] = None, sort: Optional[List] = None, select_fields: Optional[List] = None, include_vector: bool = True, show_progress_bar: bool = True)#

Retrieve all documents with filters. Filter is used to retrieve documents that match the conditions set in a filter query. This is used in advanced search to filter the documents that are searched. For more details see documents.get_where.

Parameters
  • chunksize (list) – Number of documents to retrieve per retrieval

  • include_vector (bool) – Include vectors in the search results

  • sort (list) – Fields to sort by. For each field, sort by descending or ascending. If you are using descending by datetime, it will get the most recent ones.

  • filters (list) – Query for filtering the search results

  • select_fields (list) – Fields to include in the search results, empty array/list means all fields.

Example

from relevanceai import Client
client = Client()
dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
documents = df.get_all_documents()
get_documents_by_ids(self, document_ids: Union[List, str], include_vector: bool = True)#

Retrieve documents by their IDs (the “_id” field). This is faster than applying a filter on the “_id” field.

Parameters
  • document_ids (Union[list, str]) – ID of a document in a dataset.

  • include_vector (bool) – Include vectors in the search results

Example

from relevanceai import Client, Dataset
client = Client()
dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
df.get_documents_by_ids(["sample_id"], include_vector=False)
get(self, document_ids: Union[List, str], include_vector: bool = True)#

Retrieve documents by their IDs (the “_id” field). This is faster than applying a filter on the “_id” field. This has the same functionality as get_documents_by_ids.

Parameters
  • document_ids (Union[list, str]) – ID of a document in a dataset.

  • include_vector (bool) – Include vectors in the search results

Example

from relevanceai import Client
client = Client()
dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
df.get(["sample_id"], include_vector=False)
property schema(self) Dict#

Returns the schema of a dataset. Refer to datasets.create for different field types available in a Relevance schema.

Example

from relevanceai import Client
client = Client()
dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
df.schema
property columns(self) List[str]#

Returns a list of columns

Example

from relevanceai import Client
client = Client()
dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
df.columns
filter(self, index: Union[str, None] = None, items: Union[List, None] = None, like: Union[str, None] = None, regex: Union[str, None] = None, axis: Union[int, str] = 0)#

Returns a subset of the dataset, filtered by the parameters given

Parameters
  • index (str) – The column on which to filter; if None, defaults to the _id column.

  • items (list-like) – Keep labels from axis which are in items.

  • like (str) – Keep labels from axis for which “like in label == True”.

  • regex (str (regular expression)) – Keep labels from axis for which re.search(regex, label) == True.

  • axis ({0 or 'index', 1 or 'columns'}) – The axis on which to perform the search

Return type

list of documents

Example

from relevanceai import Client
client = Client()
df = client.Dataset("ecommerce-example-encoded")
filtered = df.filter(items=["product_title", "query", "product_price"])
filtered = df.filter(index="query", like="routers")
filtered = df.filter(index="product_title", regex=".*Hard.*Drive.*")
get_documents(self, number_of_documents: int = 20, filters: Optional[list] = None, cursor: str = None, chunksize: int = 1000, sort: Optional[list] = None, select_fields: Optional[list] = None, include_vector: bool = True, include_cursor: bool = False, after_id: Optional[list] = None, include_after_id: bool = True)#

Retrieve documents with filters. Filter is used to retrieve documents that match the conditions set in a filter query. This is used in advanced search to filter the documents that are searched.

If you are looking to combine your filters with multiple ORs, simply add the following inside the query: {"strict": "must_or"}.

Parameters
  • number_of_documents (int) – Number of documents to retrieve

  • select_fields (list) – Fields to include in the search results, empty array/list means all fields.

  • cursor (string) – Cursor to paginate the document retrieval

  • chunksize (int) – Number of documents to retrieve per iteration

  • include_vector (bool) – Include vectors in the search results

  • sort (list) – Fields to sort by. For each field, sort by descending or ascending. If you are using descending by datetime, it will get the most recent ones.

  • filters (list) – Query for filtering the search results
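
Example

A usage sketch based on the parameters above; "sample_dataset_id" and "sample_field" are placeholders.

from relevanceai import Client
client = Client()
df = client.Dataset("sample_dataset_id")
documents = df.get_documents(
    number_of_documents=20,
    select_fields=["sample_field"],
)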

get_metadata(self)#

Get the metadata stored against the dataset

property metadata(self)#

Get the metadata

insert_metadata(self, metadata: dict)#

Insert metadata

upsert_metadata(self, metadata: dict)#

Upsert metadata.
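
Example

A sketch of the metadata round trip using the methods above; the payloads are illustrative only.

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
ds.insert_metadata({"owner": "data-team"})
ds.upsert_metadata({"last_reviewed": "2021-01-01"})
ds.metadata  # read the stored metadata back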

chunk_dataset(self, select_fields: List = None, chunksize: int = 100, filters: list = None, after_id: list = None)#

Function for chunking a dataset

Example

from relevanceai import Client
client = Client()
ds = client.Dataset("sample")
for c in ds.chunk_dataset(
    select_fields=["sample_label"],
    chunksize=100
):
    # Returns a dictionary with 'cursor' and 'documents' keys
    docs = c['documents']
    cursor = c['cursor']
    for d in docs:
        d.update({"value": 3})
    ds.upsert_documents(docs)
list_vector_fields(self)#

Returns a list of valid vector fields in the dataset.

Example

from relevanceai import Client
client = Client()
ds = client.Dataset("_mock_dataset_")
ds.list_vector_fields()
abstract list_cluster_aliases(self)#

isnull(self, show_progress_bar=False)#

facets(self, fields: list, date_interval: str = 'monthly', page_size: int = 5, page: int = 1, asc: bool = False)#

Get a summary of fields, such as their most common values and their min/max.

Example

from relevanceai import Client
client = Client()
from relevanceai.datasets import mock_documents
documents = mock_documents(100)
ds = client.Dataset("mock_documents")
ds.upsert_documents(documents)
ds.facets(["sample_1_value"])
get_settings(self)#

Get the settings in dataset
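
Example

A minimal usage sketch; the method takes no required arguments.

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
settings = ds.get_settings()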

list_cluster_fields(self)#
get_after_ids_for_workflows(self, num_of_workers: int = 3) List[Tuple[int, List[Optional[str]]]]#

Get multiple after IDs to run workflows in parallel

Parameters

num_of_workers (int) – The number of workers to separate the after IDs across
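
Example

A usage sketch; per the return annotation, each entry pairs a document count with the after-ID list for one worker.

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
after_ids = ds.get_after_ids_for_workflows(num_of_workers=3)
for count, after_id in after_ids:
    ...  # hand each (count, after_id) chunk to a separate worker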

update_field_children(self, field: str, field_children: list, category: str, metadata: dict = None)#

Update the field children.
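
Example

A hypothetical sketch; the field names and the category value are placeholders, not values confirmed by this page.

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
ds.update_field_children(
    field="sample_field",
    field_children=["sample_field_vector_"],  # placeholder child field
    category="vectorize",  # placeholder category
)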

list_field_children(self)#

List field children

delete_field_children(self, fieldchildren_id: str)#

Delete field children
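
Example

A sketch combining the two methods above; the ID value is a placeholder, and the shape of the listing response is an assumption.

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
children = ds.list_field_children()
ds.delete_field_children("sample_fieldchildren_id")  # placeholder ID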