Source code for backprop.tasks.text_generation

from typing import List, Tuple, Union, Dict
from backprop.models import AutoModel, BaseModel
from .base import Task
from backprop.utils.datasets import TextToTextDataset

import requests
from transformers.optimization import Adafactor

TASK = "text-generation"

DEFAULT_LOCAL_MODEL = "english-base"

LOCAL_ALIASES = {
    "english": "gpt2-medium",
    "english-small": "distilgpt2",
    "english-base": "gpt2-medium",
    "english-large": "gpt2-large",
    "gpt2": "gpt2-medium"
}

class TextGeneration(Task):
    """
    Task for text generation.

    Attributes:
        model:
            1. Model name
            2. Model name on Backprop's text-generation endpoint
            3. Model object that implements the text-generation task
        local (optional): Run locally. Defaults to False
        api_key (optional): Backprop API key for non-local inference
        device (optional): Device to run inference on. Defaults to "cuda" if available.
    """
    def __init__(self, model: Union[str, BaseModel] = None,
                 local: bool = False, api_key: str = None, device: str = None):
        models = AutoModel.list_models(task=TASK)

        super().__init__(model, local=local, api_key=api_key, device=device,
                         models=models, task=TASK,
                         default_local_model=DEFAULT_LOCAL_MODEL,
                         local_aliases=LOCAL_ALIASES)
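When running locally, friendly model names are resolved through ``LOCAL_ALIASES`` above, so ``"english-large"`` loads the ``gpt2-large`` checkpoint. A minimal usage sketch (assumes the checkpoint weights can be downloaded)::

    import backprop

    # "english-large" resolves to "gpt2-large" via LOCAL_ALIASES
    tg = backprop.TextGeneration("english-large", local=True)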
    @staticmethod
    def list_models(return_dict=False, display=False, limit=None):
        """
        Returns the list of models that can be used and finetuned with this task.

        Args:
            return_dict: Default False. True if you want to return in dict form. Otherwise returns list form.
            display: Default False. True if you want output printed directly (overrides return_dict, and returns nothing).
            limit: Default None. Maximum number of models to return -- leave None to get all models.
        """
        return AutoModel.list_models(task=TASK, return_dict=return_dict,
                                     display=display, limit=limit,
                                     aliases=LOCAL_ALIASES)
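A quick way to inspect the available models before picking one (a sketch; the actual listing depends on what AutoModel has registered)::

    import backprop

    # Print every model usable with the text-generation task, aliases included
    backprop.TextGeneration.list_models(display=True)

    # Or get a plain list to work with programmatically
    names = backprop.TextGeneration.list_models()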
    def __call__(self, text: Union[str, List[str]], min_length: int = None,
                 max_length: int = None, temperature: float = None,
                 top_k: int = None, top_p: float = None,
                 repetition_penalty: float = None, length_penalty: float = None,
                 num_beams: int = None, num_generations: int = None,
                 do_sample: bool = None):
        """Generates text to continue from the given input.

        Args:
            text (string): Text from which the model will begin generating.
            min_length (int): Minimum number of tokens to generate (1 token ~ 1 word).
            max_length (int): Maximum number of tokens to generate (1 token ~ 1 word).
            temperature (float): Value that alters the randomness of generation (0.0 is no randomness; higher values introduce randomness. 0.5 - 0.7 is a good starting point).
            top_k (int): Only choose from the top_k tokens when generating (0 is no limit).
            top_p (float): Only choose from the top tokens with combined probability greater than top_p.
            repetition_penalty (float): Penalty applied to tokens present in the text and tokens already generated in the sequence (>1 discourages repetition, <1 encourages it).
            length_penalty (float): Penalty applied to overall sequence length. Set >1 for longer sequences, or <1 for shorter ones.
            num_beams (int): Number of beams to use in beam search. Runs multiple generations and picks the best one. (1: no beam search)
            num_generations (int): How many times to run generation. Results are returned as a list.
            do_sample (bool): Whether or not sampling strategies (temperature, top_k, top_p) should be used.

        Example::

            import backprop

            tg = backprop.TextGeneration()
            tg("Geralt knew the signs, the monster was a", min_length=20, max_length=50, temperature=0.7)
            > " real danger, and he was the only one in the village who knew how to defend himself."
        """
        params = [("text", text), ("min_length", min_length),
                  ("max_length", max_length), ("temperature", temperature),
                  ("top_k", top_k), ("top_p", top_p),
                  ("repetition_penalty", repetition_penalty),
                  ("length_penalty", length_penalty), ("num_beams", num_beams),
                  ("num_generations", num_generations), ("do_sample", do_sample)]

        # Ignore None values so the model can pick optimal defaults
        task_input = {k: v for k, v in params if v is not None}
        if self.local:
            return self.model(task_input, task=TASK)
        else:
            task_input["model"] = self.model

            res = requests.post("https://api.backprop.co/text-generation",
                                json=task_input,
                                headers={"x-api-key": self.api_key}).json()

            if res.get("message"):
                raise Exception(f"Failed to make API request: {res['message']}")

            return res["output"]
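Because ``text`` accepts a list and ``num_generations`` requests several candidates, batched and multi-sample generation follow directly from the signature. A sketch (outputs will vary from run to run)::

    import backprop

    tg = backprop.TextGeneration("english-small", local=True)

    # One prompt, three sampled continuations returned as a list
    candidates = tg("The weather today is", max_length=20,
                    temperature=0.7, do_sample=True, num_generations=3)

    # A batch of prompts in one call
    batch = tg(["Once upon a time", "The stock market"], max_length=30)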
    def step(self, batch, batch_idx):
        """
        Performs a training step and returns loss.

        Args:
            batch: Batch output from the dataloader
            batch_idx: Batch index.
        """
        return self.model.training_step(batch)
    def configure_optimizers(self):
        """
        Returns the default optimizer for text generation (Adafactor, learning rate 1e-3)
        """
        return Adafactor(params=self.model.parameters(), lr=1e-3,
                         scale_parameter=False, relative_step=False)
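Since ``finetune`` below accepts a ``configure_optimizers`` override, the Adafactor default can be swapped out. A hedged sketch using AdamW instead, assuming the base Task calls the override with no arguments, like the bound method it replaces::

    import backprop
    from torch.optim import AdamW

    tg = backprop.TextGeneration("english-small", local=True)

    params = {"input_text": ["Great!", "Awful."],
              "output_text": ["positive", "negative"]}

    def adamw_optimizer():
        # Hypothetical replacement for the Adafactor default
        return AdamW(tg.model.parameters(), lr=5e-5)

    tg.finetune(params, configure_optimizers=adamw_optimizer)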
    def finetune(self, params, validation_split: Union[float, Tuple[List[int], List[int]]] = 0.15,
                 max_input_length: int = 128, max_output_length: int = 32,
                 epochs: int = 20, batch_size: int = None,
                 optimal_batch_size: int = None, early_stopping_epochs: int = 1,
                 train_dataloader = None, val_dataloader = None,
                 step = None, configure_optimizers = None):
        """
        Finetunes a model for a text generation task.

        Note:
            input_text and output_text in params must have matching ordering (item 1 of input must match item 1 of output)

        Args:
            params: Dictionary of model inputs. Contains 'input_text' and 'output_text' keys, with values as lists of input/output data.
            validation_split: Float between 0 and 1 that determines what percentage of the data to use for validation.
            max_input_length: Maximum number of tokens (1 token ~ 1 word) in input. Anything higher will be truncated. Max 512.
            max_output_length: Maximum number of tokens (1 token ~ 1 word) in output. Anything higher will be truncated. Max 512.
            epochs: Integer specifying how many training iterations to run.
            batch_size: Batch size when training. Leave as None to determine batch size automatically.
            optimal_batch_size: Optimal batch size for the model being trained -- defaults to model settings.
            early_stopping_epochs: Integer determining how many epochs will run before stopping without an improvement in validation loss.
            train_dataloader: Dataloader for providing training data when finetuning. Defaults to inbuilt dataloader.
            val_dataloader: Dataloader for providing validation data when finetuning. Defaults to inbuilt dataloader.
            step: Function determining how to call model for a training step. Defaults to step defined in this task class.
            configure_optimizers: Function that sets up the optimizer for training. Defaults to optimizer defined in this task class.

        Examples::

            import backprop

            tg = backprop.TextGeneration()

            # Any text works as training data
            inp = ["I really liked the service I received!", "Meh, it was not impressive."]
            out = ["positive", "negative"]
            params = {"input_text": inp, "output_text": out}

            # Finetune
            tg.finetune(params)
        """
        input_text = params["input_text"]
        output_text = params["output_text"]
        assert len(input_text) == len(output_text), "Input and output lists must be the same length"

        # Respect an explicitly passed optimal_batch_size; otherwise fall back
        # to the model's own setting
        if optimal_batch_size is None:
            optimal_batch_size = getattr(self.model, "optimal_batch_size", 128)

        configure_optimizers = configure_optimizers or self.configure_optimizers
        step = step or self.step

        dataset_params = {
            "input": input_text,
            "output": output_text,
            "max_input_length": max_input_length,
            "max_output_length": max_output_length
        }

        print("Processing data...")
        dataset = TextToTextDataset(dataset_params, task=TASK,
                                    process_batch=self.model.process_batch,
                                    length=len(input_text))

        super().finetune(dataset=dataset, validation_split=validation_split,
                         epochs=epochs, batch_size=batch_size,
                         optimal_batch_size=optimal_batch_size,
                         early_stopping_epochs=early_stopping_epochs,
                         step=step, configure_optimizers=configure_optimizers,
                         train_dataloader=train_dataloader,
                         val_dataloader=val_dataloader)
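The ``validation_split`` annotation also admits an explicit index split rather than a fraction. A sketch, assuming the tuple is interpreted as (train_indices, validation_indices)::

    import backprop

    tg = backprop.TextGeneration()

    inp = ["I really liked the service I received!",
           "Meh, it was not impressive.",
           "Absolutely wonderful experience.",
           "Would not recommend."]
    out = ["positive", "negative", "positive", "negative"]
    params = {"input_text": inp, "output_text": out}

    # Assumption: first list = training indices, second = validation indices
    tg.finetune(params, validation_split=([0, 1, 2], [3]))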