Dataset#

Dataset is the main class through which Relevance AI hides a lot of the underlying complexity.

It is instantiated like this:

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
ds.head()
class relevanceai.dataset.dataset.Dataset#
is_empty()#

Check if a dataset is empty.

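A minimal usage sketch (assuming this returns a boolean; the dataset id is a placeholder):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
if ds.is_empty():
    print("Dataset has no documents yet")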
label_clusters(cluster_labels, alias, vector_fields)#

Label your clusters programmatically

Example

ds.label_clusters(
    {"cluster_1" : "nice reviews"},
    alias=...,
    vector_fields=...
)
update_alias(field, alias)#

Update the alias of a field using the SDK.

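A minimal usage sketch (the dataset id, field and alias names are placeholders):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
ds.update_alias(field="sample_field", alias="new_alias")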
All read operations for Dataset

class relevanceai.dataset.read.read.Read#

A Pandas-like dataset API for interacting with the RelevanceAI Python package

chunk_dataset(select_fields=None, chunksize=100, filters=None, after_id=None)#

Function for chunking a dataset

Example

from relevanceai import Client
client = Client()
ds = client.Dataset("sample")
for c in ds.chunk_dataset(
    select_fields=["sample_label"],
    chunksize=100
):
    # Returns a dictionary with 'cursor' and 'documents' keys
    docs = c['documents']
    cursor = c['cursor']
    for d in docs:
        d.update({"value": 3})
    ds.upsert_documents(docs)
property columns: List[str]#

Returns a list of columns

Example

from relevanceai import Client
client = Client()
dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
df.columns
Return type

List[str]

delete_field_children(fieldchildren_id)#

Delete field children

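A minimal usage sketch (the dataset id and field children id are placeholders):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
ds.delete_field_children("sample_fieldchildren_id")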
facets(fields, date_interval='monthly', page_size=5, page=1, asc=False)#

Get a summary of fields - such as most common, their min/max, etc.

Example

from relevanceai import Client
client = Client()
from relevanceai.datasets import mock_documents
documents = mock_documents(100)
ds = client.Dataset("mock_documents")
ds.upsert_documents(documents)
ds.facets(["sample_1_value"])
filter(index=None, items=None, like=None, regex=None, axis=0)#

Returns a subset of the dataset, filtered by the parameters given

Parameters
  • index (str) – the field on which to filter; if None, defaults to the _id field

  • items – Keep labels from axis which are in items.

  • like (str) – Keep labels from axis for which “like in label == True”.

  • regex (str (regular expression)) – Keep labels from axis for which re.search(regex, label) == True.

  • axis ({0 or 'index', 1 or 'columns'}) – The axis on which to perform the search

Return type

list of documents

Example

from relevanceai import Client
client = Client()
df = client.Dataset("ecommerce-example-encoded")
filtered = df.filter(items=["product_title", "query", "product_price"])
filtered = df.filter(index="query", like="routers")
filtered = df.filter(index="product_title", regex=".*Hard.*Drive.*")
get(document_ids, include_vector=True)#

Retrieve a document by its ID (“_id” field). This will retrieve the document faster than a filter applied on the “_id” field. This has the same functionality as get_documents_by_ids.

Parameters
  • document_ids (Union[list, str]) – ID of a document in a dataset.

  • include_vector (bool) – Include vectors in the search results

Example

from relevanceai import Client
client = Client()
dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
df.get(["sample_id"], include_vector=False)
get_after_ids_for_workflows(num_of_workers=3)#

Get multiple after IDs to run workflows in parallel

Parameters

num_of_workers (int) – The number of workers across which to split the after IDs

Return type

List[Tuple[int, List[Optional[str]]]]

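A minimal usage sketch (the dataset id is a placeholder):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
after_ids = ds.get_after_ids_for_workflows(num_of_workers=3)
# One (worker_index, after_id) tuple per worker
for worker_index, after_id in after_ids:
    print(worker_index, after_id)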
get_all_documents(chunksize=1000, filters=None, sort=None, select_fields=None, include_vector=True, show_progress_bar=True)#

Retrieve all documents with filters. Filter is used to retrieve documents that match the conditions set in a filter query. This is used in advanced search to filter the documents that are searched. For more details see documents.get_where.

Parameters
  • chunksize (list) – Number of documents to retrieve per retrieval

  • include_vector (bool) – Include vectors in the search results

  • sort (list) – Fields to sort by. For each field, sort by descending or ascending. If you are using descending by datetime, it will get the most recent ones.

  • filters (list) – Query for filtering the search results

  • select_fields (list) – Fields to include in the search results, empty array/list means all fields.

Example

from relevanceai import Client
client = Client()
dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
documents = df.get_all_documents()
get_documents(number_of_documents=20, filters=None, cursor=None, chunksize=1000, sort=None, select_fields=None, include_vector=True, include_cursor=False, after_id=None, include_after_id=True)#

Retrieve documents with filters. Filter is used to retrieve documents that match the conditions set in a filter query. This is used in advanced search to filter the documents that are searched.

If you are looking to combine your filters with multiple ORs, simply add the following inside the query: {"strict": "must_or"}.

Parameters
  • number_of_documents (int) – Number of documents to retrieve

  • select_fields (list) – Fields to include in the search results, empty array/list means all fields.

  • cursor (string) – Cursor to paginate the document retrieval

  • chunksize (int) – Number of documents to retrieve per iteration

  • include_vector (bool) – Include vectors in the search results

  • sort (list) – Fields to sort by. For each field, sort by descending or ascending. If you are using descending by datetime, it will get the most recent ones.

  • filters (list) – Query for filtering the search results

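A minimal usage sketch (the dataset id and field name are placeholders):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
docs = ds.get_documents(
    number_of_documents=50,
    select_fields=["sample_label"],
    include_vector=False
)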
get_documents_by_ids(document_ids, include_vector=True)#

Retrieve a document by its ID (“_id” field). This will retrieve the document faster than a filter applied on the “_id” field.

Parameters
  • document_ids (Union[list, str]) – ID of a document in a dataset.

  • include_vector (bool) – Include vectors in the search results

Example

from relevanceai import Client, Dataset
client = Client()
dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
df.get_documents_by_ids(["sample_id"], include_vector=False)
get_metadata()#

Get the stored metadata

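A minimal usage sketch (the dataset id is a placeholder):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
metadata = ds.get_metadata()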
get_settings()#

Get the dataset's settings

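A minimal usage sketch (the dataset id is a placeholder):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
settings = ds.get_settings()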
head(n=5, raw_json=False, select_fields=None, **kw)#

Return the first n rows of your dataset. It is useful for quickly checking that your dataset contains the right type of data.

Parameters
  • n (int, default 5) – Number of rows to select.

  • raw_json (bool) – If True, returns raw JSON and not Pandas Dataframe

  • kw – Additional arguments to feed into show_json

Returns

The first ‘n’ rows of the caller object.

Return type

Pandas DataFrame or Dict, depending on args

Example

from relevanceai import Client

client = Client()

df = client.Dataset("sample_dataset_id", image_fields=["image_url])

df.head()
info(dtype_count=False)#

Return information about the Dataset, including the index dtype, columns and non-null values.

Parameters

dtype_count (bool) – If dtype_count is True, prints a value_counts of the data type

Returns

a pandas dataframe of information

Return type

pd.DataFrame

Example

from relevanceai import Client

client = Client()

dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
df.info()
insert_metadata(metadata)#

Insert metadata

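A minimal usage sketch (assuming metadata is a dictionary; the keys and values here are placeholders):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
ds.insert_metadata({"owner": "data-team", "version": 1})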
list_field_children()#

List field children

list_vector_fields()#

Returns a list of valid vector fields in the dataset.

Example

from relevanceai import Client
client = Client()
ds = client.Dataset("_mock_dataset_")
ds.list_vector_fields()
property metadata#

Get the metadata

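A minimal usage sketch (the dataset id is a placeholder):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
ds.metadata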
sample(n=1, frac=None, filters=None, random_state=0, select_fields=None, include_vector=True, output_format='json')#

Return a random sample of items from a dataset.

Parameters
  • n (int) – Number of items to return. Cannot be used with frac.

  • frac (float) – Fraction of items to return. Cannot be used with n.

  • filters (list) – Query for filtering the search results

  • random_state (int) – Random Seed for retrieving random documents.

  • select_fields (list) – Fields to include in the search results, empty array/list means all fields.

Example

from relevanceai import Client
client = Client()
df = client.Dataset("sample_dataset_id", image_fields=["image_url])
df.sample()
property schema: Dict#

Returns the schema of a dataset. Refer to datasets.create for different field types available in a Relevance schema.

Example

from relevanceai import Client
client = Client()
dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
df.schema
Return type

Dict

property shape#

Returns the shape (N x C) of a dataset, where N is the number of samples in the Dataset and C is the number of columns in the Dataset.

Returns

(N, C)

Return type

Tuple

Example

from relevanceai import Client

client = Client()

dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)

length, width = df.shape
update_field_children(field, field_children, category, metadata=None)#

Update the field children.

upsert_metadata(metadata)#

Upsert metadata.

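A minimal usage sketch (assuming metadata is a dictionary merged into the existing metadata; the keys and values here are placeholders):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
ds.upsert_metadata({"last_reviewed": "2022-01-01"})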
relevanceai.dataset.read.read.update_nested_dictionary(d, u)#

If the value is a dictionary, recursively call update_nested_dictionary on it; otherwise, just set the value.

The function takes two dictionaries as arguments, d and u. It iterates over the key-value pairs in u and, for each pair, checks whether the value is a dictionary. If it is, it calls update_nested_dictionary on that value and the corresponding value in d. If the value is not a dictionary, it simply sets the value in d to the value in u.

Parameters
  • d (dict) – The dictionary to update

  • u (dict) – the dictionary whose values are used to update d

Return type

A dictionary with the updated values.

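A minimal sketch of the behaviour described above (the import path follows the module shown in this section; the dictionaries are placeholders):

from relevanceai.dataset.read.read import update_nested_dictionary

d = {"config": {"a": 1, "b": 2}}
u = {"config": {"b": 3}}
updated = update_nested_dictionary(d, u)
# updated == {"config": {"a": 1, "b": 3}}; nested keys are merged rather than replaced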
Pandas like dataset API

class relevanceai.dataset.write.write.Write#
apply(func, retrieve_chunksize=100, filters=None, select_fields=None, show_progress_bar=True, use_json_encoder=True, axis=0, log_to_file=True, log_file=None, **apply_args)#

Apply a function along an axis of the DataFrame.

Objects passed to the function are Series objects whose index is either the DataFrame’s index (axis=0) or the DataFrame’s columns (axis=1). By default (result_type=None), the final return type is inferred from the return type of the applied function. Otherwise, it depends on the result_type argument.

Parameters
  • func (function) – Function to apply to each document

  • retrieve_chunksize (int) – The number of documents that are received from the original collection with each loop iteration.

  • max_workers (int) – The number of processors you want to parallelize with

  • max_error (int) – How many failed uploads before the function breaks

  • json_encoder (bool) – Whether to automatically convert documents to json encodable format

  • axis (int) – Axis along which the function is applied: 0 or 'index' applies the function to each column; 1 or 'columns' applies it to each row

Example

from relevanceai import Client
from relevanceai.package_utils.datasets import mock_documents

client = Client()

ds = client.Dataset("sample_dataset_id")
ds.upsert_documents(mock_documents(100))

def update_doc(doc):
    doc["value"] = 2
    return doc

ds.apply(update_doc)

def update_doc_wargs(doc, value1, value2):
    doc["value"] += value1
    doc["value"] *= value2
    return doc

ds.apply(func=update_doc_wargs, value1=3, value2=2)
batched_upsert_media(images, show_progress_bar=False, n_workers=None)#

It takes a list of images, splits it into batches, and then uses a thread pool to upsert the images in parallel

Parameters
  • images (List[str]) – A list of media src paths to upload

  • show_progress_bar (bool) – Show the progress bar

  • n_workers (Optional[int]) – The number of workers to use. If None, this is set to the max number in ThreadPoolExecutor

Returns

A list of media_urls

Return type

List[str]

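A minimal usage sketch (the dataset id and local file paths are placeholders):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
media_urls = ds.batched_upsert_media(
    ["path/to/image_1.png", "path/to/image_2.png"],
    show_progress_bar=True
)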
bulk_apply(bulk_func, bulk_func_args=None, bulk_func_kwargs=None, chunksize=None, filters=None, select_fields=None, transform_workers=2, push_workers=2, timeout=None, buffer_size=0, show_progress_bar=True, transform_chunksize=32, multithreaded_update=True, ingest_in_background=True, **kwargs)#

Apply a bulk function along an axis of the DataFrame.

Parameters
  • bulk_func (function) – Function to apply to a bunch of documents at a time

  • retrieve_chunksize (int) – The number of documents that are received from the original collection with each loop iteration.

  • max_workers (int) – The number of processors you want to parallelize with

  • max_error (int) – How many failed uploads before the function breaks

  • json_encoder (bool) – Whether to automatically convert documents to json encodable format

  • axis (int) – Axis along which the function is applied: 0 or 'index' applies the function to each column; 1 or 'columns' applies it to each row

Example

from relevanceai import Client

client = Client()

df = client.Dataset("sample_dataset_id")

def update_documents(documents):
    for d in documents:
        d["value"] = 10
    return documents

df.bulk_apply(update_documents)
cat(vector_name=None, fields=None)#

Concatenates numerical fields along an axis and reuploads this vector for other operations

Parameters
  • vector_name (str, default None) – name of the new concatenated vector field

  • fields (List) – fields along which the new vector will be concatenated

Example

from relevanceai import Client

client = Client()

dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)

fields = [
    "numeric_field1",
    "numeric_field2",
    "numeric_field3"
]

df.concat(fields)

concat_vector_field_name = "concat_vector_"
df.concat(vector_name=concat_vector_field_name, fields=fields)
concat(vector_name=None, fields=None)#

Concatenates numerical fields along an axis and reuploads this vector for other operations

Parameters
  • vector_name (str, default None) – name of the new concatenated vector field

  • fields (List) – fields along which the new vector will be concatenated

Example

from relevanceai import Client

client = Client()

dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)

fields = [
    "numeric_field1",
    "numeric_field2",
    "numeric_field3"
]

df.concat(fields)

concat_vector_field_name = "concat_vector_"
df.concat(vector_name=concat_vector_field_name, fields=fields)
create(schema=None)#

A dataset can store documents to be searched, retrieved, filtered and aggregated (similar to Collections in MongoDB, Tables in SQL, Indexes in ElasticSearch). A powerful and core feature of Relevance is that you can store both your metadata and vectors in the same document. When specifying the schema of a dataset and inserting your own vector use the suffix (ends with) “_vector_” for the field name, and specify the length of the vector in dataset_schema.

These are the field types supported in our datasets: [“text”, “numeric”, “date”, “dict”, “chunks”, “vector”, “chunkvector”].

For example:

{
    "product_text_description" : "text",
    "price" : "numeric",
    "created_date" : "date",
    "product_texts_chunk_": "chunks",
    "product_text_chunkvector_" : 1024
}

You don’t have to specify the schema of every single field when creating a dataset, as Relevance will automatically detect the appropriate data type for each field (vectors will be automatically identified by their “_vector_” suffix). In fact, you also don’t always have to use this endpoint to create a dataset, as /datasets/bulk_insert will infer and create the dataset and schema as you insert new documents.

Note

  • A dataset name/id can only contain lowercase letters, dashes, underscores and numbers.

  • “_id” is reserved as the key and id of a document.

  • Once a schema is set for a dataset it cannot be altered. If it has to be altered, utilise the copy dataset endpoint.

For more information about vectors check out the ‘Vectorizing’ section, services.search.vector or our blog at https://tryrelevance.com/blog. For more information about chunks and chunk vectors check out datasets.search.chunk.

Parameters

schema (dict) – Schema for specifying the fields that are vectors and their lengths

Example

from relevanceai import Client
client = Client()

documents = [
    {
        "_id": "321",
        "value": 10
    },
    {
        "_id": "4243",
        "value": 100
    }
]

dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
df.create()

df.insert_documents(documents)
Return type

Dict

delete()#

Delete a dataset

Example

from relevanceai import Client
client = Client()

dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
df.delete()
delete_documents(document_ids)#

Delete documents in a dataset

Parameters

document_ids (List[str]) – A list of document IDs to delete

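A minimal usage sketch (the dataset id and document ids are placeholders):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
ds.delete_documents(["sample_id_1", "sample_id_2"])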
get_media_documents(media_fns, verbose=False, file_log='media_upload.log', logging=True)#

Bulk insert media files. Returns a link to each media file once it has been hosted.

Parameters
  • media_fns (List[str]) – List of medias to upload

  • verbose (bool) – If True, prints statements after uploading

  • file_log (str) – The file log to write

Return type

dict

host_media_documents(media_fns, verbose=False, file_log='media_upload.log', logging=True)#

Bulk insert media files. Returns a link to each media file once it has been hosted.

Parameters
  • media_fns (List[str]) – List of medias to upload

  • verbose (bool) – If True, prints statements after uploading

  • file_log (str) – The file log to write

Return type

dict

insert_csv(filepath_or_buffer, chunksize=10000, max_workers=2, retry_chunk_mult=0.5, show_progress_bar=False, index_col=None, csv_args=None, col_for_id=None, auto_generate_id=True)#

Insert data from a csv file

Parameters
  • filepath_or_buffer – Any valid string path is acceptable. The string could be a URL. Valid URL schemes include http, ftp, s3, gs, and file.

  • chunksize (int) – Number of lines to read from csv per iteration

  • max_workers (int) – Number of workers active for multi-threading

  • retry_chunk_mult (int) – Multiplier to apply to chunksize if upload fails

  • csv_args (dict) – Optional arguments to use when reading in csv. For more info, see https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

  • index_col (None) – Optional argument to specify if there is an index column to be skipped (e.g. index_col = 0)

  • col_for_id (str) – Optional argument to use when a specific field is supposed to be used as the unique identifier (‘_id’)

  • auto_generate_id (bool = True) – Automatically generates a UUID if auto_generate_id is True and the ‘_id’ field does not exist

Example

from relevanceai import Client
client = Client()
df = client.Dataset("sample_dataset_id")

csv_filename = "temp.csv"
df.insert_csv(csv_filename)
Return type

Dict

insert_df(df, col_for_id=None, *args, **kwargs)#

Insert a dataframe into the dataset. Takes additional args and kwargs based on insert_documents.

import pandas as pd
from relevanceai import Client

client = Client()
df = client.Dataset("sample_dataset_id")
pandas_df = pd.DataFrame({"value": [3, 2, 1], "_id": ["10", "11", "12"]})
df.insert_df(pandas_df)
insert_documents(documents, max_workers=None, media_workers=None, show_progress_bar=True, chunksize=None, overwrite=True, ingest_in_background=True, media_fields=None)#

Insert a list of documents with multi-threading automatically enabled.

  • When inserting the document you can optionally specify your own id for a document by using the field name “_id”; if not specified, a random id is assigned.

  • When inserting or specifying vectors in a document use the suffix (ends with) “_vector_” for the field name. e.g. “product_description_vector_”.

  • When inserting or specifying chunks in a document, use the suffix (ends with) “_chunk_” for the field name. e.g. “products_chunk_”.

  • When inserting or specifying chunk vectors in a document’s chunks use the suffix (ends with) “_chunkvector_” for the field name. e.g. “products_chunk_.product_description_chunkvector_”.

Documentation can be found here: https://ingest-api-dev-aueast.tryrelevance.com/latest/documentation#operation/InsertEncode

Parameters
  • documents (list) – A list of documents. A document is JSON-like data in which we store our metadata and vectors. To specify the id of a document use the field ‘_id’; to specify a vector field, use the suffix ‘_vector_’

  • bulk_fn (callable) – Function to apply to documents before uploading

  • max_workers (int) – Number of workers active for multi-threading

  • retry_chunk_mult (int) – Multiplier to apply to chunksize if upload fails

  • chunksize (int) – Number of documents to upload per worker. If None, it will default to the size specified in config.upload.target_chunk_mb

  • use_json_encoder (bool) – Whether to automatically convert documents to json encodable format

  • media_fields (List[str]) – specifies which fields are local medias and need to upserted to S3. These should be given in absolute path format

Example

from relevanceai import Client

client = Client()

dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)

documents = [
    {
        "_id": "10",
        "value": 5
    },
    {
        "_id": "332",
        "value": 10
    }
]

df.insert_documents(documents)
Return type

Dict

insert_list(labels, label_field='label', **kwargs)#

It takes a list of labels, and inserts them into the database as documents

Parameters
  • labels (list) – list of labels to insert

  • label_field (str, optional) – The field in the document that contains the label.

Return type

A list of the ids of the documents that were inserted.

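A minimal usage sketch (the dataset id and labels are placeholders):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
inserted_ids = ds.insert_list(["positive", "negative", "neutral"])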
insert_local_media(media_fn, verbose=True)#

Insert local media

Parameters
  • media_fn (str) – A local media to upload

  • verbose (bool) – If True, prints a statement after uploading each media

insert_local_medias(media_fns, verbose=False, file_log='local_media_upload.log', logging=True)#

Insert a list of local medias.

Parameters
  • media_fns (List[str]) – A list of local medias

  • verbose (bool) – If True, this will print after each successful upload.

  • file_log (str) – The log to write

insert_media_bytes(bytes, filename, verbose=True)#

Insert a single media file from bytes, stored under the given filename

insert_media_folder(path, field='medias', recurse=True, *args, **kwargs)#

Given a path to a directory, this method loads all media-related files into a Dataset.

Parameters
  • field (str) – A text field of a dataset.

  • path (Union[Path, str]) – The path to the directory containing medias.

  • recurse (bool) – Indicator that determines whether to recursively insert medias from subdirectories in the directory.

Return type

dict

Example

from relevanceai import Client
client = Client()
ds = client.Dataset("dataset_id")

from pathlib import Path
path = Path("medias/")
# list(path.iterdir()) returns
# [
#    PosixPath('media.jpg'),
#    PosixPath('more-medias'), # a directory
# ]

get_all_medias: bool = True
if get_all_medias:
    # Inserts all medias, even those in the more-medias directory
    ds.insert_media_folder(
        field="medias", path=path, recurse=True
    )
else:
    # Only inserts media.jpg
    ds.insert_media_folder(
        field="medias", path=path, recurse=False
    )
insert_media_url(media_url, verbose=True)#

Insert a single media URL

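A minimal usage sketch (the dataset id and URL are placeholders):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
ds.insert_media_url("https://example.com/sample_image.png")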
insert_media_urls(media_urls, verbose=True, file_log='insert_media_urls.log', logging=True)#

Insert multiple media URLs

insert_pandas_dataframe(df, col_for_id=None, *args, **kwargs)#

Insert a dataframe into the dataset. Takes additional args and kwargs based on insert_documents.

import pandas as pd
from relevanceai import Client

client = Client()
df = client.Dataset("sample_dataset_id")
pandas_df = pd.DataFrame({"value": [3, 2, 1], "_id": ["10", "11", "12"]})
df.insert_pandas_dataframe(pandas_df)
update_where(update, filters)#

Update documents by filters. The update is applied to the documents returned by the filter.

For more information about filters refer to datasets.documents.get_where.

Example

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
ds.update_where(
    {"value": 3},
    filters=ds['value'] != 10 # apply a simple filter
)
upsert_documents(documents, max_workers=2, media_workers=None, show_progress_bar=False, chunksize=None, ingest_in_background=True, media_fields=None)#

Update a list of documents with multi-threading automatically enabled. Edits documents by providing key-value pairs of the fields you are adding or changing; make sure to include the “_id” field in each document.

Parameters
  • documents (list) – A list of documents. A document is JSON-like data in which we store our metadata and vectors. To specify the id of a document use the field ‘_id’; to specify a vector field, use the suffix ‘_vector_’

  • bulk_fn (callable) – Function to apply to documents before uploading

  • max_workers (int) – Number of workers active for multi-threading

  • retry_chunk_mult (int) – Multiplier to apply to chunksize if upload fails

  • chunksize (int) – Number of documents to upload per worker. If None, it will default to the size specified in config.upload.target_chunk_mb

  • use_json_encoder (bool) – Whether to automatically convert documents to json encodable format

  • create_id (bool) – If True, creates ID for users automatically

Example

from relevanceai import Client

client = Client()

documents = [
    {
        "_id": "321",
        "value": 10
    },
    {
        "_id": "4243",
        "value": 100
    }
]

dataset_id = "sample_dataset_id"
ds = client.Dataset(dataset_id)
ds.upsert_documents(documents)
Return type

Dict

upsert_media(media_fns, verbose=False, file_log='media_upload.log', logging=True, **kw)#

Insert medias into a dataset.

Parameters
  • media_fns (List[str]) – A list of medias to upsert

  • verbose (bool) – If True, prints statements after uploading

  • file_log (str) – The file log to write

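A minimal usage sketch (the dataset id and local file paths are placeholders):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
ds.upsert_media(["path/to/image_1.png", "path/to/image_2.png"])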
class relevanceai.dataset.io.export.Export#

Exports

__new__(**kwargs)#
to_csv(filename, **kwargs)#

Download a dataset from Relevance AI to a local .csv file

Parameters
  • filename (str) – path to downloaded .csv file

  • kwargs (Optional) – see client.get_all_documents() for extra args

Example

from relevanceai import Client

client = Client()

dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)

csv_fname = "path/to/csv/file.csv"
df.to_csv(csv_fname)
to_dict(orient='records', **kwargs)#

Returns the raw list of dicts from Relevance AI

Parameters

None

Return type

list of documents in dictionary format

Example

from relevanceai import Client

client = Client()

dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)

docs = df.to_dict(orient="records")
to_pandas_dataframe(**kwargs)#

Note

This function was introduced in 1.1.5.

Return type

DataFrame

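A minimal usage sketch (the dataset id is a placeholder):

from relevanceai import Client
client = Client()
df = client.Dataset("sample_dataset_id")
pandas_df = df.to_pandas_dataframe()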
class relevanceai.dataset.read.metadata.Metadata#

Metadata object

insert(metadata, verbose=False)#

Insert metadata

upsert(metadata, verbose=False)#

Upsert metadata.

class relevanceai.dataset.read.statistics.Statistics#
aggregate(groupby=None, metrics=None, filters=None, page_size=20, page=1, asc=False, aggregation_query=None, sort=None)#

Aggregation/Groupby of a collection using an aggregation query. The aggregation query is a json body that follows the schema of:

Example

{
    "groupby": [
        {"name": <alias>, "field": <field in the collection>, "agg": "category"},
        {"name": <alias>, "field": <another groupby field in the collection>, "agg": "numeric"}
    ],
    "metrics": [
        {"name": <alias>, "field": <numeric field in the collection>, "agg": "avg"},
        {"name": <alias>, "field": <another numeric field in the collection>, "agg": "max"}
    ]
}

For example, one can use the following aggregation to group scores by region and player name.

{
    "groupby": [
        {"name": "region", "field": "player_region", "agg": "category"},
        {"name": "player_name", "field": "name", "agg": "category"}
    ],
    "metrics": [
        {"name": "average_score", "field": "final_score", "agg": "avg"},
        {"name": "max_score", "field": "final_score", "agg": "max"},
        {"name": "total_score", "field": "final_score", "agg": "sum"},
        {"name": "average_deaths", "field": "final_deaths", "agg": "avg"},
        {"name": "highest_deaths", "field": "final_deaths", "agg": "max"}
    ]
}
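
A minimal sketch of calling this from the SDK using the groupby/metrics shapes above (the dataset id and field names are placeholders):

from relevanceai import Client
client = Client()
ds = client.Dataset("sample_dataset_id")
results = ds.aggregate(
    groupby=[{"name": "region", "field": "player_region", "agg": "category"}],
    metrics=[{"name": "average_score", "field": "final_score", "agg": "avg"}]
)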
corr(X, Y, vector_field, alias, groupby=None, fontsize=16, show_plot=True)#

Note

This function was introduced in 1.2.2.

describe(return_type='pandas')#

Descriptive statistics include those that summarize the central tendency, dispersion, and shape of a dataset’s distribution, excluding NaN values.

Example

from relevanceai import Client
client = Client()
dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
field = "sample_field"
df.describe() # returns pandas dataframe of stats
df.describe(return_type='dict') # return raw json stats
Return type

dict

facets(fields=[], date_interval='monthly', page_size=5, page=1, asc=False)#

Get a summary of fields - such as most common, their min/max, etc.

Example

from relevanceai import Client
client = Client()
from relevanceai.datasets import mock_documents
documents = mock_documents(100)
ds = client.Dataset("mock_documents")
ds.upsert_documents(documents)
ds.facets(["sample_1_value"])
health(output_format='dataframe')#

Gives you a summary of the health of your vectors, e.g. how many documents are missing vectors and how many documents have zero vectors

Parameters

output_format (str) – The format of the output. Must either be “dataframe” or “json”.

Example

from relevanceai import Client
client = Client()
df = client.Dataset("sample_dataset_id")
df.health()
Return type

Union[DataFrame, dict]

value_counts(field)#

Return a Series containing counts of unique values.

Parameters

field (str) – dataset field on which to compute value counts

Return type

Series

Example

from relevanceai import Client
client = Client()
dataset_id = "sample_dataset_id"
df = client.Dataset(dataset_id)
field = "sample_field"
value_counts_df = df.value_counts(field)