lucidrains / x-clip

A concise but complete implementation of CLIP with various experimental improvements from recent papers

License: MIT License

Python 100.00%
artificial-intelligence deep-learning contrastive-learning zero-shot-learning multi-modal-learning

x-clip's Introduction

Join us on Discord

x-clip

A concise but complete implementation of CLIP with various experimental improvements from recent papers

Install

$ pip install x-clip

Usage

import torch
from x_clip import CLIP

clip = CLIP(
    dim_text = 512,
    dim_image = 512,
    dim_latent = 512,
    num_text_tokens = 10000,
    text_enc_depth = 6,
    text_seq_len = 256,
    text_heads = 8,
    visual_enc_depth = 6,
    visual_image_size = 256,
    visual_patch_size = 32,
    visual_heads = 8,
    visual_patch_dropout = 0.5,             # patch dropout probability, used in Kaiming He's FLIP to save compute and improve end results - 0.5 is a good value, 0.75 on the high end is tolerable
    use_all_token_embeds = False,           # whether to use fine-grained contrastive learning (FILIP)
    decoupled_contrastive_learning = True,  # use decoupled contrastive learning (DCL) objective function, removing positive pairs from the denominator of the InfoNCE loss (CLOOB + DCL)
    extra_latent_projection = True,         # whether to use separate projections for text-to-image vs image-to-text comparisons (CLOOB)
    use_visual_ssl = True,                  # whether to do self-supervised learning on images
    use_mlm = False,                        # use masked language learning (MLM) on text (DeCLIP)
    text_ssl_loss_weight = 0.05,            # weight for text MLM loss
    image_ssl_loss_weight = 0.05            # weight for image self-supervised learning loss
)

# mock data

text = torch.randint(0, 10000, (4, 256))
images = torch.randn(4, 3, 256, 256)

# train

loss = clip(
    text,
    images,
    freeze_image_encoder = False,   # whether to freeze the image encoder if using a pretrained image network, as proposed by the LiT paper
    return_loss = True              # needs to be set to True to return the contrastive loss
)

loss.backward()

You can also pass in an external visual transformer / residual net. You simply have to make sure your image encoder returns a set of embeddings of shape batch x seq x dim, and that dim_image is specified as the dimension of the returned embeddings. Below is an example using a vision transformer from vit_pytorch.

$ pip install "vit_pytorch>=0.25.6"

import torch
from x_clip import CLIP

from vit_pytorch import ViT
from vit_pytorch.extractor import Extractor

base_vit = ViT(
    image_size = 256,
    patch_size = 32,
    num_classes = 1000,
    dim = 512,
    depth = 6,
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1,
    emb_dropout = 0.1
)

vit = Extractor(
    base_vit,
    return_embeddings_only = True
)

clip = CLIP(
    image_encoder = vit,
    dim_image = 512,           # must be set to the same dimension as the vision transformer above
    dim_text = 512,
    dim_latent = 512,
    num_text_tokens = 10000,
    text_enc_depth = 6,
    text_seq_len = 256,
    text_heads = 8
)

text = torch.randint(0, 10000, (4, 256))
images = torch.randn(4, 3, 256, 256)

loss = clip(text, images, return_loss = True)
loss.backward()

Finally, one can also have the text transformer be externally defined. It will need to return the embeddings including the CLS token, for now.

import torch
from x_clip import CLIP, TextTransformer

from vit_pytorch import ViT
from vit_pytorch.extractor import Extractor

base_vit = ViT(
    image_size = 256,
    patch_size = 32,
    num_classes = 1000,
    dim = 512,
    depth = 6,
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1,
    emb_dropout = 0.1
)

image_encoder = Extractor(
    base_vit,
    return_embeddings_only = True
)

text_encoder = TextTransformer(
    dim = 512,
    num_tokens = 10000,
    max_seq_len = 256,
    depth = 6,
    heads = 8
)

clip = CLIP(
    image_encoder = image_encoder,
    text_encoder = text_encoder,
    dim_image = 512,
    dim_text = 512,
    dim_latent = 512
)

text = torch.randint(0, 10000, (4, 256))
images = torch.randn(4, 3, 256, 256)

loss = clip(text, images, return_loss = True)
loss.backward()

Multiview CL Losses

This repository also supports the multiview contrastive learning loss, as proposed in DeCLIP. Just pass in the augmented text and/or augmented image, and it will be auto-calculated, weighted by the multiview_loss_weight set on initialization.

ex.

import torch
from x_clip import CLIP, TextTransformer

from vit_pytorch import ViT
from vit_pytorch.extractor import Extractor

base_vit = ViT(
    image_size = 256,
    patch_size = 32,
    num_classes = 1000,
    dim = 512,
    depth = 6,
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1,
    emb_dropout = 0.1
)

image_encoder = Extractor(
    base_vit,
    return_embeddings_only = True
)

text_encoder = TextTransformer(
    dim = 512,
    num_tokens = 10000,
    max_seq_len = 256 + 1,
    depth = 6,
    heads = 8
)

clip = CLIP(
    image_encoder = image_encoder,
    text_encoder = text_encoder,
    dim_image = 512,
    dim_text = 512,
    dim_latent = 512,
    extra_latent_projection = True,
    multiview_loss_weight = 0.1         # weight multiview contrastive loss by 0.1
)

text = torch.randint(0, 10000, (4, 256))
images = torch.randn(4, 3, 256, 256)

aug_text = torch.randint(0, 10000, (4, 256))  # augmented text (backtranslation or EDA), same dimensions as text
aug_images = torch.randn(4, 3, 256, 256)      # augmented images, same dimension as images above
loss = clip(
    text,
    images,
    aug_text = aug_text,           # pass in augmented texts
    aug_image = aug_images,        # pass in augmented images
    return_loss = True,
    freeze_image_encoder = True
)

loss.backward()

You can even send in more than one augmented text or image.

# ...

aug_texts = (
    torch.randint(0, 10000, (4, 256)),
    torch.randint(0, 10000, (4, 256)),
)

aug_images = (
    torch.randn(4, 3, 256, 256),
    torch.randn(4, 3, 256, 256),
)

loss = clip(
    text,
    images,
    aug_text = aug_texts,
    aug_image = aug_images,
    return_loss = True,
    freeze_image_encoder = True
)

loss.backward()

Custom Vision Self-supervised Learning Module

You can pass in your own vision self-supervised learning module through the visual_ssl keyword, like so:

import torch
from x_clip import CLIP
from x_clip.visual_ssl import SimSiam

from vit_pytorch import ViT
from vit_pytorch.extractor import Extractor

base_vit = ViT(
    image_size = 256,
    patch_size = 32,
    num_classes = 1000,
    dim = 512,
    depth = 6,
    heads = 16,
    mlp_dim = 2048,
    dropout = 0.1,
    emb_dropout = 0.1
)

image_encoder = Extractor(
    base_vit,
    return_embeddings_only = True
)

visual_ssl = SimSiam(                 # SimSiam defined externally - needs to be a module that accepts an image of the same dimensions as CLIP and returns a scalar loss
    image_encoder,
    image_size = 256,
    hidden_layer = -1
)

clip = CLIP(
    image_encoder = image_encoder,
    dim_image = 512,
    dim_text = 512,
    dim_latent = 512,
    use_mlm = True,
    visual_ssl = visual_ssl,           # SSL module passed into CLIP
    use_all_token_embeds = False,
    extra_latent_projection = False,
    mlm_random_token_prob = 0.1
)

text = torch.randint(0, 10000, (4, 256))
images = torch.randn(4, 3, 256, 256)

loss = clip(text, images, return_loss = True)
loss.backward()

Citations

@misc{radford2021learning,
    title   = {Learning Transferable Visual Models From Natural Language Supervision}, 
    author  = {Alec Radford and Jong Wook Kim and Chris Hallacy and Aditya Ramesh and Gabriel Goh and Sandhini Agarwal and Girish Sastry and Amanda Askell and Pamela Mishkin and Jack Clark and Gretchen Krueger and Ilya Sutskever},
    year    = {2021},
    eprint  = {2103.00020},
    archivePrefix = {arXiv},
    primaryClass = {cs.CV}
}
@misc{yao2021filip,
    title   = {FILIP: Fine-grained Interactive Language-Image Pre-Training}, 
    author  = {Lewei Yao and Runhui Huang and Lu Hou and Guansong Lu and Minzhe Niu and Hang Xu and Xiaodan Liang and Zhenguo Li and Xin Jiang and Chunjing Xu},
    year    = {2021},
    eprint  = {2111.07783},
    archivePrefix = {arXiv},
    primaryClass = {cs.CV}
}
@misc{fürst2021cloob,
    title   = {CLOOB: Modern Hopfield Networks with InfoLOOB Outperform CLIP},
    author  = {Andreas Fürst and Elisabeth Rumetshofer and Viet Tran and Hubert Ramsauer and Fei Tang and Johannes Lehner and David Kreil and Michael Kopp and Günter Klambauer and Angela Bitto-Nemling and Sepp Hochreiter},
    year    = {2021},
    eprint  = {2110.11316},
    archivePrefix = {arXiv},
    primaryClass = {cs.LG}
}
@misc{yeh2021decoupled,
    title   = {Decoupled Contrastive Learning},
    author  = {Chun-Hsiao Yeh and Cheng-Yao Hong and Yen-Chi Hsu and Tyng-Luh Liu and Yubei Chen and Yann LeCun},
    year    = {2021},
    eprint  = {2110.06848},
    archivePrefix = {arXiv},
    primaryClass = {cs.LG}
}
@misc{zhai2021lit,
    title   = {LiT: Zero-Shot Transfer with Locked-image Text Tuning},
    author  = {Xiaohua Zhai and Xiao Wang and Basil Mustafa and Andreas Steiner and Daniel Keysers and Alexander Kolesnikov and Lucas Beyer},
    year    = {2021},
    eprint  = {2111.07991},
    archivePrefix = {arXiv},
    primaryClass = {cs.CV}
}
@misc{li2021supervision,
    title   = {Supervision Exists Everywhere: A Data Efficient Contrastive Language-Image Pre-training Paradigm},
    author  = {Yangguang Li and Feng Liang and Lichen Zhao and Yufeng Cui and Wanli Ouyang and Jing Shao and Fengwei Yu and Junjie Yan},
    year    = {2021},
    eprint  = {2110.05208},
    archivePrefix = {arXiv},
    primaryClass = {cs.CV}
}
@article{mu2021slip,
    author  = {Norman Mu and Alexander Kirillov and David Wagner and Saining Xie},
    title   = {SLIP: Self-supervision meets Language-Image Pre-training},
    journal = {arXiv preprint arXiv:2112.12750},
    year    = {2021}
}
@misc{su2021roformer,
    title   = {RoFormer: Enhanced Transformer with Rotary Position Embedding},
    author  = {Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu},
    year    = {2021},
    eprint  = {2104.09864},
    archivePrefix = {arXiv},
    primaryClass = {cs.CL}
}
@inproceedings{anonymous2022normformer,
    title   = {NormFormer: Improved Transformer Pretraining with Extra Normalization},
    author  = {Anonymous},
    booktitle = {Submitted to The Tenth International Conference on Learning Representations },
    year    = {2022},
    url     = {https://openreview.net/forum?id=GMYWzWztDx5},
    note    = {under review}
}
@inproceedings{Li2022ScalingLP,
    title   = {Scaling Language-Image Pre-training via Masking},
    author  = {Yanghao Li and Haoqi Fan and Ronghang Hu and Christoph Feichtenhofer and Kaiming He},
    year    = {2022}
}
@article{Liu2022PatchDropoutEV,
    title   = {PatchDropout: Economizing Vision Transformers Using Patch Dropout},
    author  = {Yue Liu and Christos Matsoukas and Fredrik Strand and Hossein Azizpour and Kevin Smith},
    journal = {ArXiv},
    year    = {2022},
    volume  = {abs/2208.07220}
}
@misc{shi2023enhance,
    title   = {Enhance audio generation controllability through representation similarity regularization}, 
    author  = {Yangyang Shi and Gael Le Lan and Varun Nagaraja and Zhaoheng Ni and Xinhao Mei and Ernie Chang and Forrest Iandola and Yang Liu and Vikas Chandra},
    year    = {2023},
    eprint  = {2309.08773},
    archivePrefix = {arXiv},
    primaryClass = {cs.SD}
}

x-clip's People

Contributors

lucidrains

x-clip's Issues

Text and Vision tokens different from CLIP

Hi and thanks for all the work done in this repository!

I noticed that the implementation of the CLS token in the vision transformer, as well as the tokens used in the text transformer, are different in your implementation.

As far as I understand, CLIP attaches a CLS embedding token before the patch embeddings are sent through the transformer. In this repo, it seems the mean is computed over all patch embeddings instead, meaning the CLS token has no learnable parameters.

In addition, CLIP uses a [SOS] and an [EOS] token, which are combined with the token embeddings at the beginning and end, respectively. In your implementation, the text transformer uses a single CLS token.

I am trying to make use of the FILIP part and incorporate it into the OpenAI implementation of CLIP. Unfortunately, I am somewhat unsure about how to handle the text tokens in the fine-grained loss.

When comparing patch token embeddings to text token embeddings, should I ignore both the [SOS] and [EOS] tokens?

Thanks in advance,

kind regards,

M
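
For reference, a rough sketch of a FILIP-style fine-grained similarity that masks out special text tokens might look like the following. This is only an illustration of the idea, not this repository's actual implementation; the shapes and the special_token_mask argument are assumptions.

import torch
import torch.nn.functional as F

def filip_similarity(text_tokens, image_tokens, special_token_mask, temperature = 0.07):
    # text_tokens: (b_txt, n_txt, d), image_tokens: (b_img, n_img, d)
    # special_token_mask: (b_txt, n_txt) boolean, True at positions to ignore (e.g. [SOS]/[EOS]/padding)
    text_tokens = F.normalize(text_tokens, dim = -1)
    image_tokens = F.normalize(image_tokens, dim = -1)

    # all pairwise token similarities: (b_txt, b_img, n_txt, n_img)
    sim = torch.einsum('xtd,yid->xyti', text_tokens, image_tokens)

    keep = (~special_token_mask).unsqueeze(1)                     # (b_txt, 1, n_txt)

    # text -> image: each kept text token matched to its best image patch, then averaged
    text_to_image = sim.max(dim = -1).values                      # (b_txt, b_img, n_txt)
    text_to_image = (text_to_image * keep).sum(dim = -1) / keep.sum(dim = -1).clamp(min = 1)

    # image -> text: each image patch matched to its best kept text token, then averaged
    sim_masked = sim.masked_fill(~keep.unsqueeze(-1), float('-inf'))
    image_to_text = sim_masked.max(dim = -2).values.mean(dim = -1)

    return text_to_image / temperature, image_to_text / temperature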

Some question about x_clip

Dear Lucidrains,

Thanks for your selfless contribution and outstanding work; it has been very helpful. I'm an ML beginner, so my foundations are not solid, and I have a question about x_clip.

I'm programming a student-oriented image-search demo with CLIP, and I want to compare the original CLIP with x_clip. In the search program I can load CLIP with model, preprocess = clip.load('ViT-B/32', device=device) where device = 'cuda', but I'm not sure how to achieve the equivalent with x_clip. Could you please help me with this?
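
As far as this README shows, x-clip does not ship pretrained weights or a clip.load style helper, so a rough sketch (assuming you have trained an x_clip CLIP instance yourself, with hyperparameters as in the README's usage example and a hypothetical checkpoint path) would be to save and restore the state dict with plain PyTorch:

import torch
from x_clip import CLIP

clip = CLIP(
    dim_text = 512,
    dim_image = 512,
    dim_latent = 512,
    num_text_tokens = 10000,
    text_enc_depth = 6,
    text_seq_len = 256,
    text_heads = 8,
    visual_enc_depth = 6,
    visual_image_size = 256,
    visual_patch_size = 32,
    visual_heads = 8
)

# ... training loop as in the README ...

torch.save(clip.state_dict(), 'x_clip.pt')                # hypothetical checkpoint path

# in the search program: rebuild the model with the same hyperparameters, then restore
clip.load_state_dict(torch.load('x_clip.pt', map_location = 'cpu'))
clip.eval()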

Extract Text and Image Latents

Hi, in the current implementation we can only extract the text and image embeddings (by setting return_encodings = True), which are obtained before the latent linear projections are applied. Wouldn't it be better to add an option to extract the latent embeddings? Another reason this matters is that, with the current code, it is impossible to extract the similarity matrix between a batch of images and a batch of texts.
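
For what it's worth, once per-sample latents are obtained (however that is done; the commented lines below assume attribute names such as to_text_latent / to_visual_latent and a particular return_encodings output order, which are guesses about the internals), the similarity matrix between a batch of texts and a batch of images is just a normalized matrix product:

import torch
import torch.nn.functional as F

# hypothetical: obtain the pre-projection encodings, then project and pool them
# text_enc, image_enc = clip(text, images, return_encodings = True)
# text_latents  = clip.to_text_latent(text_enc[:, 0])              # assumed attribute / CLS pooling
# image_latents = clip.to_visual_latent(image_enc.mean(dim = 1))   # assumed attribute / mean pooling

text_latents  = torch.randn(4, 512)   # stand-ins for the real latents
image_latents = torch.randn(4, 512)

text_latents  = F.normalize(text_latents, dim = -1)
image_latents = F.normalize(image_latents, dim = -1)

sim = text_latents @ image_latents.t()   # (batch_text, batch_image) cosine similarity matrix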

Allow other types of visual SSL when initiating CLIP

In the following code, part of CLIP.__init__:

        if use_visual_ssl:
            if visual_ssl_type == 'simsiam':
                ssl_type = SimSiam
            elif visual_ssl_type == 'simclr':
                ssl_type = partial(SimCLR, temperature = simclr_temperature)
            else:
                raise ValueError(f'unknown visual_ssl_type')

            self.visual_ssl = ssl_type(
                self.visual_transformer,
                image_size = visual_image_size,
                hidden_layer = visual_ssl_hidden_layer
            )

the visual self-supervised learning module is hardcoded. I would suggest changing this to accept the visual SSL module as an argument when instantiating CLIP, allowing the same flexibility as for the image encoder and text encoder.

Example:

barlow = BarlowTwins(augmentation_fns)
clip = CLIP(..., visual_ssl=barlow)

Using different encoders in CLIP

Hi,
I am wondering if it is possible to use different encoders in CLIP?
For images, could one use a ResNet instead of a ViT, for example?
And is it possible to replace the text encoder with a feature encoder? If I have a vector of features for a given image and I want to use x-clip, how should I do that?
I have made a code example that doesn't seem to work; here is what I did:

import torch
from x_clip import CLIP
import torch.nn as nn
from torchvision import models

class Image_Encoder(torch.nn.Module):
    #output size is (bs,512)
    def __init__(self):
        super(Image_Encoder, self).__init__()
        self.model_pre = models.resnet18(pretrained=False)
        self.base=nn.Sequential(*list(self.model_pre.children()))
        self.base[0]=nn.Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        self.resnet=self.base[:-1]

    def forward(self, x):
        out=self.resnet(x).squeeze()
        return out


class features_encoder(torch.nn.Module):
    #output size is (bs,512)
    def __init__(self):
        super(features_encoder, self).__init__()
        self.model =nn.Linear(2048,512)

    def forward(self, x):
        out=self.model(x)
        return out

images_encoder=Image_Encoder()
features_encoder=features_encoder()

clip = CLIP(
    image_encoder = images_encoder,
    text_encoder = features_encoder,
    dim_image = 512,
    dim_text = 512,
    dim_latent = 512
)

features= torch.randn(4,2048)
images = torch.randn(4, 3, 256, 256)

loss = clip(features, images, return_loss = True)
loss.backward()

but I got the following error: forward() takes 2 positional arguments but 3 were given

Thanks
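
One guess (an assumption, not confirmed here) is that CLIP passes an extra argument such as a token mask to one of the custom encoders, while the forward methods above accept only a single input. A defensive sketch would be to accept and ignore extra arguments, and to return a sequence of embeddings (batch x seq x dim), as the README requires for external encoders:

import torch
import torch.nn as nn

class FeaturesEncoder(nn.Module):
    # maps a (batch, 2048) feature vector to a length-1 sequence of 512-dim embeddings
    def __init__(self):
        super().__init__()
        self.model = nn.Linear(2048, 512)

    def forward(self, x, *args, **kwargs):    # tolerate any extra positional / keyword arguments
        out = self.model(x)
        return out.unsqueeze(1)               # (batch, 1, 512) - batch x seq x dim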

Visual ssl with channels different than 3

Hi,
there seems to be a bug when trying to use visual SSL with a number of channels different from 3. I think the error comes from the visual SSL module, around line 280 here:

#send a mock image tensor to instantiate parameters
self.forward(torch.randn(1, 3, image_size, image_size))
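
A possible fix, sketched below with a hypothetical channels argument (not something the module currently exposes, as far as this issue shows), would be to stop hardcoding the 3 in that mock tensor:

import torch
import torch.nn as nn

class VisualSSLStub(nn.Module):
    # illustrative stub only: takes `channels` so the mock tensor matches the real input
    def __init__(self, net, image_size, channels = 3):
        super().__init__()
        self.net = net
        # send a mock image tensor to instantiate parameters, with the right channel count
        self.forward(torch.randn(1, channels, image_size, image_size))

    def forward(self, images):
        return self.net(images).mean()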

Bad Escape character Error

I got this error while trying to run another repository of yours (voicebox):

  File "/root/work/code/mw_speech_synthesis/speech_synthesis/training/voicebox/voicebox_pytorch/__init__.py", line 1, in <module>
    from voicebox_pytorch.voicebox_pytorch import (
  File "/root/work/code/mw_speech_synthesis/speech_synthesis/training/voicebox/voicebox_pytorch/__init__.py", line 1, in <module>
    from voicebox_pytorch.voicebox_pytorch import (
  File "/root/work/code/mw_speech_synthesis/speech_synthesis/training/voicebox/voicebox_pytorch/voicebox_pytorch.py", line 28, in <module>
    from spear_tts_pytorch import TextToSemantic
  File "/root/.miniforge/envs/vb/lib/python3.10/site-packages/spear_tts_pytorch/__init__.py", line 1, in <module>
    from spear_tts_pytorch.spear_tts_pytorch import (
  File "/root/.miniforge/envs/vb/lib/python3.10/site-packages/spear_tts_pytorch/spear_tts_pytorch.py", line 27, in <module>
    from x_clip.tokenizer import tokenizer
  File "/root/.miniforge/envs/vb/lib/python3.10/site-packages/x_clip/tokenizer.py", line 169, in <module>
    tokenizer = SimpleTokenizer()
  File "/root/.miniforge/envs/vb/lib/python3.10/site-packages/x_clip/tokenizer.py", line 77, in __init__
    self.pat = re.compile(
  File "/root/.miniforge/envs/vb/lib/python3.10/re.py", line 251, in compile
    return _compile(pattern, flags)
  File "/root/.miniforge/envs/vb/lib/python3.10/re.py", line 303, in _compile
    p = sre_compile.compile(pattern, flags)
  File "/root/.miniforge/envs/vb/lib/python3.10/sre_compile.py", line 788, in compile
    p = sre_parse.parse(p, flags)
  File "/root/.miniforge/envs/vb/lib/python3.10/sre_parse.py", line 955, in parse
    p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
  File "/root/.miniforge/envs/vb/lib/python3.10/sre_parse.py", line 444, in _parse_sub
    itemsappend(_parse(source, state, verbose, nested + 1,
  File "/root/.miniforge/envs/vb/lib/python3.10/sre_parse.py", line 555, in _parse
    code1 = _class_escape(source, this)
  File "/root/.miniforge/envs/vb/lib/python3.10/sre_parse.py", line 350, in _class_escape
    raise source.error('bad escape %s' % escape, len(escape))
re.error: bad escape \p at position 59

The source is this line:

self.pat = re.compile(
            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE)
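
The \p{L} and \p{N} Unicode property classes are not supported by Python's standard-library re module; to the best of my knowledge the original OpenAI tokenizer compiles this pattern with the third-party regex package instead, so a likely fix is:

import regex   # pip install regex - supports \p{...} Unicode property classes

pat = regex.compile(
    r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
    regex.IGNORECASE
)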

Model forward outputs to text/image similarity score

Any insight on how to take the image/text embeddings (or the nominal model forward output) and compute a simple similarity score, as done in the Hugging Face implementation? HF example here

In the original paper I see that the dot products of the image/text encoder outputs were used, but here I was having trouble with the dimensions of the outputs.
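
A minimal sketch of that dot-product scoring, assuming you already have pooled text and image latents of shape (batch, dim_latent) (see the latent-extraction issue above for how those might be obtained; the fixed temperature below is illustrative):

import torch
import torch.nn.functional as F

text_latents  = F.normalize(torch.randn(2, 512), dim = -1)   # stand-ins for real pooled latents
image_latents = F.normalize(torch.randn(1, 512), dim = -1)

logit_scale = 1 / 0.07                                        # illustrative; CLIP learns this scale
logits_per_image = logit_scale * image_latents @ text_latents.t()   # (n_images, n_texts)

# probability of each text matching the image, as in the Hugging Face example
probs = logits_per_image.softmax(dim = -1)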

Unable to train to convergence (small dataset)

Hi, nice work with x-clip. I'm hoping to play around with it and eventually combine it with your DALLE2 work.

I'm currently having some trouble training on roughly 30k image-text pairs. The loss eventually goes negative and starts producing NaNs. I've dropped the learning rate (1e-4) and I'm clipping gradients (max_norm = 0.5).

Any thoughts on what sane training params/configs would be for such a small dataset using x-clip?

NaN with mock data

Hi lucidrains,

Try this and it will NaN within 100 steps (latest GitHub code). The loss looks fine before the NaN.

import torch
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True    
torch.backends.cudnn.benchmark = True

import random
import numpy as np
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

num_text_tokens = 10000
batch_sz = 12
text_seq_len = 256
visual_image_size = 256

# mock data

data_sz = 1000
all_text = torch.randint(0, num_text_tokens, (data_sz, text_seq_len)).cuda()
all_images = torch.randn(data_sz, 3, visual_image_size, visual_image_size).cuda()

text = torch.zeros((batch_sz, text_seq_len), dtype=torch.long).cuda()
images = torch.zeros((batch_sz, 3, visual_image_size, visual_image_size)).cuda()

##########################################################################################

import wandb
import datetime
wandb.init(project="Test", name=datetime.datetime.today().strftime('%Y-%m-%d-%H-%M-%S'), save_code=False)

from x_clip import CLIP

clip = CLIP(
    dim_text = 512,
    dim_image = 512,
    dim_latent = 512,
    num_text_tokens = num_text_tokens,
    text_enc_depth = 6,
    text_seq_len = text_seq_len,
    text_heads = 8,
    visual_enc_depth = 6,
    visual_image_size = visual_image_size,
    visual_patch_size = 32,
    visual_heads = 8,
    use_all_token_embeds = False,           # whether to use fine-grained contrastive learning (FILIP)
    decoupled_contrastive_learning = True,  # use decoupled contrastive learning (DCL) objective function, removing positive pairs from the denominator of the InfoNCE loss (CLOOB + DCL)
    extra_latent_projection = True,         # whether to use separate projections for text-to-image vs image-to-text comparisons (CLOOB)
    use_visual_ssl = True,                  # whether to do self-supervised learning on images
    visual_ssl_type = 'simclr',             # can be either 'simclr' or 'simsiam', depending on using DeCLIP or SLIP
    use_mlm = False,                        # use masked language learning (MLM) on text (DeCLIP)
    text_ssl_loss_weight = 0.05,            # weight for text MLM loss
    image_ssl_loss_weight = 0.05            # weight for image self-supervised learning loss
).cuda()

optimizer = torch.optim.Adam(clip.parameters(), lr=1e-4, betas=(0.9, 0.99))

for step in range(999999):
    for i in range(batch_sz):
        data_id = random.randrange(0, data_sz - 1)
        text[i] = all_text[data_id]
        images[i] = all_images[data_id]

    loss = clip(
        text,
        images,
        freeze_image_encoder = False,   # whether to freeze image encoder if using a pretrained image net, proposed by LiT paper
        return_loss = True              # needs to be set to True to return contrastive loss
    )
    clip.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(clip.parameters(), 1.0)
    optimizer.step()

    now_loss = loss.item()
    wandb.log({"loss": now_loss}, step = step)
    print(step, now_loss)

    if 'nan' in str(now_loss):
        break
