relevanceai.operations.text_finetuning.unsupervised_finetuning_ops#

Warning

This is a beta feature and will be changing in the future. Do not use this in production systems.

Example

Train a text model using GPL (Generative Pseudo-Labelling). This can be helpful for domain adaptation.

You can find out more about GPL from: https://github.com/UKPLab/gpl

from relevanceai import Client
from relevanceai.operations.text_finetuning.unsupervised_finetuning_ops import GPLOps
client = Client()
ds = client.Dataset("ecommerce")
ops = GPLOps.from_dataset(dataset=ds,
    base_model="distilbert-base-uncased",
    t5_generator="BeIR/query-gen-msmarco-t5-base-v1",
    retrievers=["msmarco-distilbert-base-v3", "msmarco-MiniLM-L-6-v3"],
    cross_encoder="cross-encoder/ms-marco-MiniLM-L-6-v2",
    chunksize_gpl=16,
    output_path="trained_model",
)
ops.run(dataset=ds, text_field="detail_desc")

Module Contents#

class relevanceai.operations.text_finetuning.unsupervised_finetuning_ops.GPLOps(base_model: str = 'distilbert-base-uncased', t5_generator: str = 'BeIR/query-gen-msmarco-t5-base-v1', retrievers: List[str] = ['msmarco-distilbert-base-v3', 'msmarco-MiniLM-L-6-v3'], cross_encoder: str = 'cross-encoder/ms-marco-MiniLM-L-6-v2', chunksize_gpl: int = 16, output_path: str = 'trained_model', credentials: Optional[relevanceai.client.helpers.Credentials] = None)#

Batch API client

prepare_data_for_finetuning(self, documents: List[dict], text_field: str, dir_to_save_corpus: str, title_field: Optional[str] = None, corpus_filename: str = 'corpus.jsonl')#
fine_tune(self, path_to_generated_data: str = '.', output_dir: str = 'trained_model', gpl_steps: int = 500, do_evaluation: bool = False, qgen_prefix: str = 'qgen', **gpl_kwargs)#
get_model(self, output_path: Optional[str] = None)#
run(self, dataset: str, text_field: str, gpl_steps: int = 500, path_to_generated_data: str = '.', output_dir: str = 'trained_model', dir_to_save_corpus: str = '.', do_evaluation: bool = False)#

Finetune a model using Generative Pseudo-Labelling

Example

from relevanceai import Client
client = Client()

ds = client.Dataset("quickstart")
# !pip install -q gpl
from relevanceai.operations.text_finetuning.unsupervised_finetuning_ops import GPLOps
ops = GPLOps.from_dataset(ds)
ops.run(dataset=ds, text_field="product_title")
Parameters
  • text_field (str) – The field you want to use for fine-tuning

  • dir_to_save_corpus (str) – The path to save the corpus that is going to be used by the GPL algorithm.

  • gpl_steps (int) – The number of steps in Generative Pseudo Labelling

  • path_to_generated_data (str) – The path to generated data

  • output_dir (str) – The path of the output directory

  • dataset (str) – The dataset to run the fine-tuning operation on

  • do_evaluation (bool) – If True, it performs the evaluation

classmethod from_client(self, client, *args, **kwargs)#
classmethod from_dataset(self, dataset: Any, base_model: str = 'distilbert-base-uncased', t5_generator: str = 'BeIR/query-gen-msmarco-t5-base-v1', retrievers: List[str] = ['msmarco-distilbert-base-v3', 'msmarco-MiniLM-L-6-v3'], cross_encoder: str = 'cross-encoder/ms-marco-MiniLM-L-6-v2', chunksize_gpl: int = 16, output_path: str = 'trained_model', **kwargs)#