Skip to content

NLP_CONNECT_VIT_GPT2

The NLP_CONNECT_VIT_GPT2 node captions an input image and produces an output string wrapped in a dataframe. Params: default : Image — The image to caption. Returns: out : DataFrame — DataFrame containing the caption column and a single row.
Python Code
from flojoy import flojoy, run_in_venv, DataFrame, Image


@flojoy
@run_in_venv(
    pip_dependencies=[
        "transformers==4.30.2",
        "torch~=2.0.1",
        "torchvision~=0.15.2",
    ],
    verbose=True,
)
def NLP_CONNECT_VIT_GPT2(default: Image) -> DataFrame:
    """Caption an input image with the nlpconnect/vit-gpt2-image-captioning model.

    The image channels are assembled into a PIL RGB image, a pinned revision
    of the model repository is fetched with ``snapshot_download``, and a
    single caption is generated with beam search (4 beams, at most 16 tokens).

    Parameters
    ----------
    default : Image
        The image to caption.

    Returns
    -------
    DataFrame
        DataFrame with one ``caption`` column holding a single row: the
        generated caption, stripped of surrounding whitespace.
    """

    # NOTE(review): imports live inside the function body, presumably because
    # run_in_venv executes the body in its own environment — confirm before
    # hoisting them to module level.
    import numpy as np
    import pandas as pd

    import transformers
    import torch
    import torchvision.transforms.functional as TF
    from flojoy import DataFrame, snapshot_download

    # Gather the colour planes; the alpha plane is appended only when present.
    alpha = default.a
    planes = [default.r, default.g, default.b]
    if alpha is not None:
        planes.append(alpha)
    pixels = np.stack(planes, axis=2)

    # The model input is always RGB, whether or not alpha was stacked above.
    pil_image = TF.to_pil_image(pixels).convert("RGB")

    # Fetch the pinned model revision into the local flojoy cache.
    repo_dir = snapshot_download(
        repo_id="nlpconnect/vit-gpt2-image-captioning",
        revision="dc68f91c06a1ba6f15268e5b9c13ae7a7c514084",
        local_dir_use_symlinks=False,
    )

    # Model, image processor and tokenizer all load from the same snapshot.
    captioner = transformers.VisionEncoderDecoderModel.from_pretrained(repo_dir)
    image_processor = transformers.ViTImageProcessor.from_pretrained(repo_dir)
    text_tokenizer = transformers.AutoTokenizer.from_pretrained(repo_dir)

    with torch.inference_mode():
        encoded = image_processor(images=[pil_image], return_tensors="pt")
        generated_ids = captioner.generate(  # type: ignore
            encoded.pixel_values,  # type: ignore
            max_length=16,
            num_beams=4,
        )
        decoded = text_tokenizer.batch_decode(  # type: ignore
            generated_ids, skip_special_tokens=True
        )

    # One image went in, so the first decoded string is the caption.
    caption = decoded[0].strip()

    return DataFrame(df=pd.DataFrame({"caption": [caption]}))

Find this Flojoy Block on GitHub

Example

Having problems with this example app? Join our Discord community and we will help you out!
React Flow mini map

In this example, the LOCAL_FILE node loads a local file and passes it to NLP_CONNECT_VIT_GPT2, which produces the appropriate image caption.