Lucidrains 系列项目源码解析（十七）

.\\lucidrains\\DALLE-pytorch\\dalle_pytorch\\reversible.py

# Standard library
from operator import itemgetter

# PyTorch core and neural-network modules
import torch
import torch.nn as nn

# Base class for custom autograd functions
from torch.autograd.function import Function

# Helpers for capturing / restoring per-device RNG state
from torch.utils.checkpoint import get_device_states, set_device_states

# 将参数路由到可逆层函数的函数

def route_args(router, args, depth):
    """Distribute keyword arguments to the f / g halves of every layer.

    Args:
        router: mapping from argument name to a sequence with one
            ``(to_f, to_g)`` pair of truthy/falsy flags per layer.
        args: keyword arguments supplied by the caller.
        depth: number of layers.

    Returns:
        A list of ``depth`` tuples ``(f_kwargs, g_kwargs)``.
    """
    # One (f_kwargs, g_kwargs) pair per layer, initially empty
    routed_args = [(dict(), dict()) for _ in range(depth)]

    # Only arguments that the router knows about are forwarded
    matched_keys = [key for key in args if key in router]

    for key in matched_keys:
        val = args[key]
        # The index is called `layer` so it does not shadow the `depth` parameter
        for layer, ((f_args, g_args), routes) in enumerate(zip(routed_args, router[key])):
            # Each flag in `routes` decides whether that half receives the argument
            new_f_args, new_g_args = map(lambda route: ({key: val} if route else {}), routes)
            routed_args[layer] = ({**f_args, **new_f_args}, {**g_args, **new_g_args})

    return routed_args

# 请参阅保存和配置随机数生成器的示例https://pytorch.org/docs/stable/_modules/torch/utils/checkpoint.html

class Deterministic(nn.Module):
    """Wrap a module so a forward pass can be replayed with the same RNG state.

    With ``record_rng=True`` the CPU (and, if CUDA is initialized, device)
    RNG states are stored before running the wrapped net. With
    ``set_rng=True`` the stored states are restored inside a forked RNG
    context before running the net again.
    """

    def __init__(self, net):
        super().__init__()
        self.net = net
        self.cpu_state = None
        self.cuda_in_fwd = None
        self.gpu_devices = None
        self.gpu_states = None

    def record_rng(self, *args):
        """Store the current CPU RNG state and, if CUDA is initialized, device states."""
        self.cpu_state = torch.get_rng_state()
        if torch.cuda.is_initialized():
            self.cuda_in_fwd = True
            self.gpu_devices, self.gpu_states = get_device_states(*args)

    def forward(self, *args, record_rng=False, set_rng=False, **kwargs):
        """Run the wrapped net, optionally recording or restoring RNG state."""
        if record_rng:
            self.record_rng(*args)

        if not set_rng:
            return self.net(*args, **kwargs)

        rng_devices = []
        if self.cuda_in_fwd:
            rng_devices = self.gpu_devices

        # fork_rng restores the ambient RNG state when the block exits
        with torch.random.fork_rng(devices=rng_devices, enabled=True):
            torch.set_rng_state(self.cpu_state)
            if self.cuda_in_fwd:
                set_device_states(self.gpu_devices, self.gpu_states)
            return self.net(*args, **kwargs)

#https://github.com/RobinBruegger/RevTorch/blob/master/revtorch/revtorch.py

# 一旦确定多GPU 正常工作，请重构并将PR 发送回源代码

class ReversibleBlock(nn.Module):
    """Reversible residual block built from two sub-modules ``f`` and ``g``.

    The input is split in two along dim 2; the forward pass computes
    ``y1 = x1 + f(x2)`` and ``y2 = x2 + g(y1)``. ``backward_pass``
    recomputes the inputs from the outputs and returns the input gradient.
    """

    def __init__(self, f, g):
        super().__init__()
        self.f = Deterministic(f)
        self.g = Deterministic(g)

    # NOTE: the dict defaults are only unpacked, never mutated, in this class.
    def forward(self, x, f_args={}, g_args={}):
        """Return ``cat([y1, y2], dim=2)``; runs under ``no_grad``."""
        x1, x2 = torch.chunk(x, 2, dim=2)
        y1, y2 = None, None

        with torch.no_grad():
            # RNG is recorded only in training mode so it can be replayed later
            y1 = x1 + self.f(x2, record_rng=self.training, **f_args)
            y2 = x2 + self.g(y1, record_rng=self.training, **g_args)

        return torch.cat([y1, y2], dim=2)

    def backward_pass(self, y, dy, f_args={}, g_args={}):
        """Reconstruct the block input from ``y`` and propagate ``dy``.

        Returns:
            ``(x, dx)``: the recomputed input and its gradient.
        """
        y1, y2 = torch.chunk(y, 2, dim=2)
        del y

        dy1, dy2 = torch.chunk(dy, 2, dim=2)
        del dy

        # Re-run g with the recorded RNG and backprop dy2 through it
        with torch.enable_grad():
            y1.requires_grad = True
            gy1 = self.g(y1, set_rng=True, **g_args)
            torch.autograd.backward(gy1, dy2)

        with torch.no_grad():
            x2 = y2 - gy1
            del y2, gy1

            dx1 = dy1 + y1.grad
            del dy1
            y1.grad = None

        # Re-run f with the recorded RNG and backprop dx1 through it
        with torch.enable_grad():
            x2.requires_grad = True
            fx2 = self.f(x2, set_rng=True, **f_args)
            torch.autograd.backward(fx2, dx1, retain_graph=True)

        with torch.no_grad():
            x1 = y1 - fx2
            del y1, fx2

            dx2 = dy2 + x2.grad
            del dy2
            x2.grad = None

            x = torch.cat([x1, x2.detach()], dim=2)
            dx = torch.cat([dx1, dx2], dim=2)

        return x, dx

class _ReversibleFunction(Function):
    """Autograd function that runs a stack of reversible blocks.

    Only the final output is kept on the context; ``backward`` walks the
    blocks in reverse order and calls each block's ``backward_pass``.
    """

    @staticmethod
    def forward(ctx, x, blocks, args):
        """Apply each block in order; ``args`` holds one kwargs dict per block."""
        ctx.args = args
        for block, kwarg in zip(blocks, args):
            x = block(x, **kwarg)
        ctx.y = x.detach()
        ctx.blocks = blocks
        return x

    @staticmethod
    def backward(ctx, dy):
        """Propagate ``dy`` back through the blocks, last block first."""
        y = ctx.y
        args = ctx.args
        # Blocks must be undone in the opposite order they were applied
        for block, kwargs in zip(ctx.blocks[::-1], args[::-1]):
            y, dy = block.backward_pass(y, dy, **kwargs)
        # No gradients for the `blocks` and `args` inputs
        return dy, None, None

class SequentialSequence(nn.Module):
    """Run ``(f, g)`` layer pairs one after another with residual connections."""

    def __init__(self, layers, args_route={}, layer_dropout=0.):
        super().__init__()
        # Every routed argument needs exactly one routing entry per layer
        assert all(len(route) == len(layers) for route in args_route.values()), 'each argument route map must have the same depth as the number of sequential layers'
        self.layers = layers
        self.args_route = args_route
        # Stored only; `forward` below does not read it
        self.layer_dropout = layer_dropout

    def forward(self, x, **kwargs):
        """Apply ``x = x + f(x)`` then ``x = x + g(x)`` for every layer pair."""
        # Split the keyword arguments per layer and per half (f / g)
        args = route_args(self.args_route, kwargs, len(self.layers))
        layers_and_args = list(zip(self.layers, args))

        for (f, g), (f_args, g_args) in layers_and_args:
            x = x + f(x, **f_args)
            x = x + g(x, **g_args)
        return x

class ReversibleSequence(nn.Module):
    """Stack of ``ReversibleBlock`` modules executed via ``_ReversibleFunction``."""

    def __init__(self, blocks, args_route={}):
        super().__init__()
        self.args_route = args_route
        # Each (f, g) pair becomes one reversible block
        self.blocks = nn.ModuleList([ReversibleBlock(f=f, g=g) for f, g in blocks])

    def forward(self, x, **kwargs):
        """Duplicate ``x`` on the last dim, run the blocks, average the two halves."""
        x = torch.cat([x, x], dim=-1)

        blocks = self.blocks
        # Route keyword arguments to each block's f / g halves
        args = route_args(self.args_route, kwargs, len(blocks))
        args = list(map(lambda x: {'f_args': x[0], 'g_args': x[1]}, args))

        out = _ReversibleFunction.apply(x, blocks, args)
        # Split the output on the last dim and take the mean of the two parts
        return torch.stack(out.chunk(2, dim=-1)).mean(dim=0)

.\\lucidrains\\DALLE-pytorch\\dalle_pytorch\\transformer.py

# Standard library
from collections import deque
from collections.abc import Iterable
from functools import partial
from itertools import islice, cycle

# PyTorch and einops
import torch
from torch import nn, einsum
import torch.nn.functional as F
from einops import rearrange

# Project modules
from dalle_pytorch.reversible import ReversibleSequence, SequentialSequence
from dalle_pytorch.attention import Attention, SparseAttention, SparseConvCausalAttention, SparseAxialCausalAttention

# Rotary embedding package
from rotary_embedding_torch import RotaryEmbedding, broadcat

# 辅助函数

# 判断变量是否存在

def exists(val):
    """Return True when ``val`` is not ``None``."""
    return val is not None

# 返回默认值

def default(val, d):
    """Return ``val`` unless it is ``None``, in which case return ``d``."""
    return val if val is not None else d

# 将变量转换为元组

def cast_tuple(val, depth=1):
    """Return ``val`` unchanged if it is iterable, else a tuple repeating it ``depth`` times."""
    return val if isinstance(val, Iterable) else (val,) * depth

#最大分割类别

class DivideMax(nn.Module):
    """Divide the input by its maximum along ``dim``."""

    def __init__(self, dim):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        # The maximum is detached, so no gradient flows through it
        maxes = x.amax(dim=self.dim, keepdim=True).detach()
        return x / maxes

#非缓存类

class NonCached(nn.Module):
    """
    A wrapper for layers that do not support the inference cache themselves.
    Reconstructs the full sequence before the layer and
    cuts the suffix of the outputs after the layer.
    """

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, *, cache=None, cache_key=None, **kwargs):
        # Number of new positions; only this many outputs are returned
        n = x.shape[-2]

        if cache is not None:
            if cache_key in cache:
                # Prepend everything seen so far along the sequence dim
                x = torch.cat([cache[cache_key], x], dim=-2)
            cache[cache_key] = x

        out = self.fn(x, **kwargs)

        return out[:, -n:]

# 缓存类

class CachedAs(nn.Module):
    """
    A wrapper that defines a key for the inference cache.
    """

    def __init__(self, cache_key, fn):
        super().__init__()
        self.cache_key = cache_key
        self.fn = fn

    def forward(self, x, *, cache=None, **kwargs):
        # Forward the call, injecting this wrapper's fixed cache key
        return self.fn(x, cache=cache, cache_key=self.cache_key, **kwargs)

#图层缩放类

class LayerScale(nn.Module):
    """Multiply the output of ``fn`` by a learnable per-feature scale.

    The initial scale depends on ``depth``: 0.1 up to 18 layers, 1e-5 up
    to 24 layers, 1e-6 beyond that.
    """

    def __init__(self, dim, depth, fn):
        super().__init__()
        if depth <= 18:
            init_eps = 0.1
        elif depth <= 24:
            init_eps = 1e-5
        else:
            init_eps = 1e-6

        # Shape (1, 1, dim) broadcasts against the output of fn
        scale = torch.zeros(1, 1, dim).fill_(init_eps)
        self.scale = nn.Parameter(scale)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(x, **kwargs) * self.scale

#层归一化类

class PreNorm(nn.Module):
    """Apply LayerNorm before ``fn`` and, when ``sandwich`` is set, after it too."""

    def __init__(self, dim, fn, sandwich=False):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        # Output norm is a no-op unless sandwich normalization is requested
        self.norm_out = nn.LayerNorm(dim) if sandwich else nn.Identity()
        self.fn = fn

    def forward(self, x, **kwargs):
        x = self.norm(x)
        x = self.fn(x, **kwargs)
        return self.norm_out(x)

# 前馈类

class GEGLU(nn.Module):
    """Split the last dim in two halves and gate the first with GELU of the second."""

    def forward(self, x):
        x, gates = x.chunk(2, dim=-1)
        return x * F.gelu(gates)


class FeedForward(nn.Module):
    """Linear -> GEGLU -> Dropout -> Linear feed-forward network.

    Args:
        dim: input and output feature size.
        dropout: dropout probability applied after the gate.
        mult: hidden width multiplier; the hidden width is ``int(dim * mult)``.
    """

    def __init__(self, dim, dropout=0., mult=4.):
        super().__init__()
        # nn.Linear needs integer sizes, and the default `mult` is a float
        inner_dim = int(dim * mult)
        self.net = nn.Sequential(
            # Twice the hidden width: GEGLU halves it again
            nn.Linear(dim, inner_dim * 2),
            GEGLU(),
            nn.Dropout(dropout),
            nn.Linear(inner_dim, dim),
        )

    def forward(self, x, cache=None, cache_key=None):
        # `cache` and `cache_key` are accepted but not used here
        return self.net(x)

# 标记班次

class PreShiftToken(nn.Module):
    """Shift part of each token's features before calling ``fn``.

    Text positions get half of their features shifted by one position
    along the sequence. Image positions are laid out as an
    ``image_size``-row grid and get one quarter of their features shifted
    from the left neighbour and one quarter from the row above. With a
    cache, a deque holding the last ``image_size`` image tokens is kept
    so that a single new token can be shifted incrementally.
    """

    def __init__(self, fn, image_size, seq_len):
        super().__init__()
        self.fn = fn
        self.image_size = image_size
        self.seq_len = seq_len
        self.img_seq_len = image_size ** 2
        self.text_len = seq_len - self.img_seq_len + 1

    def forward(self, x, cache=None, cache_key=None, **kwargs):
        seq_len, image_size, text_len = self.seq_len, self.image_size, self.text_len

        # Incremental path: a queue for this layer is already in the cache
        if exists(cache) and cache_key in cache:
            offset = cache['offset']
            assert offset >= text_len, "cached inference for text is not supported"
            q = cache[cache_key]
            assert isinstance(q, deque) and len(q) == image_size

            # Only the newest position is processed
            x_top, x_left, *x_pass = x[:, -1].chunk(4, dim=-1)

            q.append((x_top, x_left))
            # Oldest entry supplies the "top" part, the previous one the "left" part
            x_top = q.popleft()[0]
            x_left = q[-2][1]
            # First column of a row has no left neighbour
            if (offset - text_len) % image_size == 0:
                x_left = torch.zeros_like(x_left)

            x = torch.cat((x_top, x_left, *x_pass), dim=-1)
            return self.fn(x[:, None], cache=cache, **kwargs)

        n = x.shape[1]
        padding = seq_len - n + 1

        # If the sequence is shorter than the text length, there are no image tokens to shift
        if n < text_len:
            return self.fn(x, **kwargs)

        # Split into text and image tokens
        x_text, x_img = x[:, :text_len], x[:, text_len:]
        # Pad the image tokens so they can be reshaped into a grid
        x_img = F.pad(x_img, (0, 0, 0, padding))
        x_img = rearrange(x_img, 'b (h w) d -> b h w d', h=image_size)

        # Shift half of the text features by one position
        x_text_shift, x_text_pass = x_text.chunk(2, dim=-1)
        x_text_shift = F.pad(x_text_shift, (0, 0, 1, -1))
        x_text = torch.cat((x_text_shift, x_text_pass), dim=-1)

        # Shift image features from the top and from the left
        x_img_shift_top, x_img_shift_left, *x_img_pass = x_img.chunk(4, dim=-1)
        x_img_shift_left = F.pad(x_img_shift_left, (0, 0, 1, -1))
        x_img_shift_top = F.pad(x_img_shift_top, (0, 0, 0, 0, 1, -1))
        x_img = torch.cat((x_img_shift_top, x_img_shift_left, *x_img_pass), dim=-1)

        # Merge the text and image sequences back together
        x_img = rearrange(x_img, 'b h w d -> b (h w) d')
        x_img = x_img[:, :-padding]
        x = torch.cat((x_text, x_img), dim=1)

        if exists(cache):
            # Zero-valued placeholders for positions that do not exist yet
            dummy_top, dummy_left, *_ = x[:, -1].chunk(4, dim=-1)
            dummy_top, dummy_left = torch.zeros_like(dummy_top), torch.zeros_like(dummy_left)

            q = deque()
            x_img = x_img[:, -image_size:]
            # Fill with placeholders until the queue holds image_size entries
            for _ in range(image_size - x_img.shape[1]):
                q.append((dummy_top, dummy_left))
            # Then append the (top, left) parts of the actual image tokens
            for i in range(x_img.shape[1]):
                q.append(x_img[:, i].chunk(4, dim=-1)[:2])
            cache[cache_key] = q

        return self.fn(x, cache=cache, **kwargs)
# 主要的Transformer类
class Transformer(nn.Module):
# 初始化函数
def __init__(
self,
*,
dim,
depth,
seq_len,
reversible = False,
causal = True,
heads = 8,
dim_head = 64,
ff_mult = 4,
attn_dropout = 0.,
ff_dropout = 0.,
attn_types = None,
image_fmap_size = None,
sparse_attn = False,
stable = False,
sandwich_norm = False,
shift_tokens = False,
rotary_emb = True,
shared_attn_ids = None,
shared_ff_ids = None,
optimize_for_inference = False, # 使用缓存友好的掩码注意力代替稀疏注意力
# 前向传播函数
def forward(self, x, **kwargs):
    """Run ``self.layers`` on ``x``, passing ``self.pos_emb`` as ``rotary_pos_emb``."""
    return self.layers(x, rotary_pos_emb = self.pos_emb, **kwargs)
# 获取注意力掩码函数
def _get_attention_mask(self, attn_type):
    """Build a boolean ``(seq_len, seq_len)`` mask for an axial attention type.

    Args:
        attn_type: ``'axial_row'`` or ``'axial_col'``.

    Returns:
        A bool tensor; the first ``text_len`` columns are True everywhere,
        plus the per-row or per-column image blocks.

    Raises:
        ValueError: for any other ``attn_type``.
    """
    img_seq_len = self.image_fmap_size ** 2
    text_len = self.seq_len + 1 - img_seq_len

    static_mask = torch.zeros(self.seq_len, self.seq_len, dtype=torch.bool)
    # Every position may attend to the text columns
    static_mask[:, :text_len] = True

    if attn_type == 'axial_row':
        # Positions in the same image row see each other
        for row in range(self.image_fmap_size):
            begin = text_len + row * self.image_fmap_size
            end = text_len + (row + 1) * self.image_fmap_size
            static_mask[begin:end, begin:end] = True
    elif attn_type == 'axial_col':
        # Positions in the same image column see each other
        for col in range(self.image_fmap_size):
            begin = text_len + col
            static_mask[begin::self.image_fmap_size, begin::self.image_fmap_size] = True
    else:
        raise ValueError(f'attention type "{attn_type}" can\'t be simulated with a static mask')
    return static_mask

.\\lucidrains\\DALLE-pytorch\\dalle_pytorch\\vae.py

# 导入所需的库
import io
import sys
import os
import requests
import PIL
import warnings
import hashlib
import urllib
import yaml
from pathlib import Path
from tqdm import tqdm
from math import sqrt, log
from packaging import version
# 导入第三方库
from omegaconf import OmegaConf
from taming.models.vqgan import VQModel, GumbelVQ
import importlib
# 导入 PyTorch 库
import torch
from torch import nn
import torch.nn.functional as F
# 导入 einops 库
from einops import rearrange
# 导入 dalle_pytorch 库中的 distributed_utils 模块
from dalle_pytorch import distributed_utils
# 常量定义
# Local cache directory for downloaded checkpoints
CACHE_PATH = os.path.expanduser("~/.cache/dalle")

# Download locations of the pretrained OpenAI VAE encoder / decoder
OPENAI_VAE_ENCODER_PATH = 'https://cdn.openai.com/dall-e/encoder.pkl'
OPENAI_VAE_DECODER_PATH = 'https://cdn.openai.com/dall-e/decoder.pkl'

# Download locations of the pretrained VQGAN weights and config
VQGAN_VAE_PATH = 'https://heibox.uni-heidelberg.de/f/140747ba53464f49b476/?dl=1'
VQGAN_VAE_CONFIG_PATH = 'https://heibox.uni-heidelberg.de/f/6ecf2af6c658432c8298/?dl=1'
# 辅助方法
def exists(val):
    """Return True when ``val`` is not ``None``."""
    return val is not None
def default(val, d):
    """Return ``val`` unless it is ``None``, in which case return ``d``."""
    return val if val is not None else d
def load_model(path):
    """Load a ``torch.save``-d object from ``path`` onto the CPU."""
    # NOTE(review): torch.load unpickles the file - only use with trusted checkpoints.
    with open(path, 'rb') as f:
        return torch.load(f, map_location = torch.device('cpu'))
def map_pixels(x, eps = 0.1):
    """Linearly map values from ``[0, 1]`` to ``[eps, 1 - eps]``."""
    return (1 - 2 * eps) * x + eps
def unmap_pixels(x, eps = 0.1):
    """Invert ``map_pixels`` and clamp the result to ``[0, 1]``."""
    return torch.clamp((x - eps) / (1 - 2 * eps), 0, 1)
def download(url, filename = None, root = CACHE_PATH):
    """Download ``url`` into ``root`` unless the target file already exists.

    In a distributed run only the local root worker downloads; the other
    workers wait on a local barrier until the file is there.

    Args:
        url: source URL.
        filename: target file name; defaults to the basename of ``url``.
        root: target directory.

    Returns:
        Path of the downloaded (or already present) file.

    Raises:
        RuntimeError: if the target path exists but is not a regular file.
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded
    import urllib.request

    if (
        not distributed_utils.is_distributed
        or distributed_utils.backend.is_local_root_worker()
    ):
        os.makedirs(root, exist_ok = True)
    filename = default(filename, os.path.basename(url))

    download_target = os.path.join(root, filename)
    download_target_tmp = os.path.join(root, f'tmp.{filename}')

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    if (
        distributed_utils.is_distributed
        and not distributed_utils.backend.is_local_root_worker()
        and not os.path.isfile(download_target)
    ):
        # If the file does not exist yet, wait until the root worker has downloaded it
        distributed_utils.backend.local_barrier()

    if os.path.isfile(download_target):
        return download_target

    with urllib.request.urlopen(url) as source, open(download_target_tmp, "wb") as output:
        # Content-Length may be absent; tqdm accepts total=None
        content_length = source.info().get("Content-Length")
        total = int(content_length) if content_length is not None else None
        with tqdm(total=total, ncols=80) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    # Write to a temp name first so a partial download is never mistaken for the file
    os.rename(download_target_tmp, download_target)
    if (
        distributed_utils.is_distributed
        and distributed_utils.backend.is_local_root_worker()
    ):
        # Release the workers waiting above
        distributed_utils.backend.local_barrier()
    return download_target
def make_contiguous(module):
    """Replace every parameter of ``module`` in place with a contiguous copy."""
    with torch.no_grad():
        for param in module.parameters():
            param.set_(param.contiguous())
# 获取包版本信息
def get_pkg_version(pkg_name):
    """Return the installed version string of the distribution ``pkg_name``."""
    try:
        # Standard library since Python 3.8; pkg_resources is deprecated
        from importlib.metadata import version as _dist_version
    except ImportError:
        from pkg_resources import get_distribution
        return get_distribution(pkg_name).version
    return _dist_version(pkg_name)
# 预训练的 OpenAI 离散 VAE
class OpenAIDiscreteVAE(nn.Module):
    """Wrapper around the pretrained OpenAI discrete VAE encoder / decoder."""

    def __init__(self):
        super().__init__()
        assert version.parse(get_pkg_version('torch')) < version.parse('1.11.0'), 'torch version must be <= 1.10 in order to use OpenAI discrete vae'

        # Download (if needed) and load the encoder and decoder
        self.enc = load_model(download(OPENAI_VAE_ENCODER_PATH))
        self.dec = load_model(download(OPENAI_VAE_DECODER_PATH))
        make_contiguous(self)

        self.channels = 3
        self.num_layers = 3
        self.image_size = 256
        self.num_tokens = 8192

    @torch.no_grad()
    def get_codebook_indices(self, img):
        """Encode ``img`` and return the argmax token index per position, flattened to ``(b, h*w)``."""
        img = map_pixels(img)
        z_logits = self.enc.blocks(img)
        z = torch.argmax(z_logits, dim = 1)
        return rearrange(z, 'b h w -> b (h w)')

    def decode(self, img_seq):
        """Decode a ``(b, n)`` tensor of token indices into images."""
        b, n = img_seq.shape
        # Lay the sequence out as a square grid
        img_seq = rearrange(img_seq, 'b (h w) -> b h w', h = int(sqrt(n)))

        z = F.one_hot(img_seq, num_classes = self.num_tokens)
        z = rearrange(z, 'b h w c -> b c h w').float()
        x_stats = self.dec(z).float()
        # Only the first three channels are used for the reconstruction
        x_rec = unmap_pixels(torch.sigmoid(x_stats[:, :3]))
        return x_rec

    def forward(self, img):
        # NotImplemented is a constant, not an exception; raising it is a TypeError
        raise NotImplementedError
# 从 Taming Transformers 论文中获取 VQGAN 模型
# https://arxiv.org/abs/2012.09841
# 从字符串中获取对象
def get_obj_from_str(string, reload=False):
    """Resolve a dotted path such as ``'pkg.module.Name'`` to the named object.

    Args:
        string: dotted path; everything before the last dot is the module.
        reload: when True, reload the module before looking the name up.
    """
    module, cls = string.rsplit(".", 1)
    if reload:
        module_imp = importlib.import_module(module)
        importlib.reload(module_imp)
    return getattr(importlib.import_module(module, package=None), cls)
# 根据配置实例化对象
def instantiate_from_config(config):
    """Instantiate ``config["target"]`` with ``config["params"]`` as keyword arguments.

    Raises:
        KeyError: if ``config`` has no ``target`` entry.
    """
    if "target" not in config:
        raise KeyError("Expected key `target` to instantiate.")
    return get_obj_from_str(config["target"])(**config.get("params", dict()))
# VQGAN VAE 类
class VQGanVAE(nn.Module):
    """Wrapper around a pretrained VQGAN from Taming Transformers.

    Args:
        vqgan_model_path: checkpoint path; when None the default
            checkpoint and config are downloaded into ``CACHE_PATH``.
        vqgan_config_path: config path, used when ``vqgan_model_path`` is given.
    """

    def __init__(self, vqgan_model_path=None, vqgan_config_path=None):
        super().__init__()

        if vqgan_model_path is None:
            model_filename = 'vqgan.1024.model.ckpt'
            config_filename = 'vqgan.1024.config.yml'
            download(VQGAN_VAE_CONFIG_PATH, config_filename)
            download(VQGAN_VAE_PATH, model_filename)
            config_path = str(Path(CACHE_PATH) / config_filename)
            model_path = str(Path(CACHE_PATH) / model_filename)
        else:
            model_path = vqgan_model_path
            config_path = vqgan_config_path

        config = OmegaConf.load(config_path)

        model = instantiate_from_config(config["model"])

        # NOTE(review): torch.load unpickles the file - only use with trusted checkpoints.
        state = torch.load(model_path, map_location = 'cpu')['state_dict']
        model.load_state_dict(state, strict = False)

        print(f"Loaded VQGAN from {model_path} and {config_path}")

        self.model = model

        # Resolution scaling factor; its base-2 log is stored as num_layers
        f = config.model.params.ddconfig.resolution / config.model.params.ddconfig.attn_resolutions[0]

        self.num_layers = int(log(f)/log(2))
        self.channels = 3
        self.image_size = 256
        self.num_tokens = config.model.params.n_embed
        self.is_gumbel = isinstance(self.model, GumbelVQ)

        self._register_external_parameters()

    def _register_external_parameters(self):
        """Register external parameters for DeepSpeed partitioning."""
        if (
            not distributed_utils.is_distributed
            or not distributed_utils.using_backend(
                distributed_utils.DeepSpeedBackend)
        ):
            return

        deepspeed = distributed_utils.backend.backend_module
        deepspeed.zero.register_external_parameter(
            self, self.model.quantize.embed.weight if self.is_gumbel else self.model.quantize.embedding.weight)

    @torch.no_grad()
    def get_codebook_indices(self, img):
        """Encode ``img`` and return token indices with shape ``(b, n)``."""
        b = img.shape[0]
        # Rescale from [0, 1] to [-1, 1]
        img = (2 * img) - 1
        _, _, [_, _, indices] = self.model.encode(img)
        if self.is_gumbel:
            return rearrange(indices, 'b h w -> b (h w)', b=b)
        return rearrange(indices, '(b n) -> b n', b = b)

    def decode(self, img_seq):
        """Decode a ``(b, n)`` tensor of token indices into images in ``[0, 1]``."""
        b, n = img_seq.shape
        one_hot_indices = F.one_hot(img_seq, num_classes = self.num_tokens).float()
        # The codebook attribute name differs between the Gumbel and plain models
        z = one_hot_indices @ self.model.quantize.embed.weight if self.is_gumbel \
            else (one_hot_indices @ self.model.quantize.embedding.weight)

        z = rearrange(z, 'b (h w) c -> b c h w', h = int(sqrt(n)))
        img = self.model.decode(z)

        # Rescale from [-1, 1] to [0, 1]
        img = (img.clamp(-1., 1.) + 1) * 0.5
        return img

    def forward(self, img):
        # NotImplemented is a constant, not an exception; raising it is a TypeError
        raise NotImplementedError

.\\lucidrains\\DALLE-pytorch\\dalle_pytorch\\version.py

# 定义变量 __version__，赋值为字符串 \’1.6.6\’
__version__ = '1.6.6'

.\\lucidrains\\DALLE-pytorch\\dalle_pytorch\\__init__.py

# 从dalle_pytorch包中导入DALLE, CLIP, DiscreteVAE类
# 从dalle_pytorch包中导入OpenAIDiscreteVAE, VQGanVAE类
from dalle_pytorch.dalle_pytorch import DALLE, CLIP, DiscreteVAE
from dalle_pytorch.vae import OpenAIDiscreteVAE, VQGanVAE
# 从pkg_resources模块中导入get_distribution函数
from pkg_resources import get_distribution
# 从dalle_pytorch.version模块中导入__version__变量
from dalle_pytorch.version import __version__

.\\lucidrains\\DALLE-pytorch\\generate.py

# 导入必要的库
import argparse
from pathlib import Path
from tqdm import tqdm
# 导入 torch 库
import torch
# 导入 einops 库中的 repeat 函数
from einops import repeat
# 导入 vision 相关库
from PIL import Image
from torchvision.utils import make_grid, save_image
# 导入 dalle_pytorch 库中的类和工具
from dalle_pytorch import __version__
from dalle_pytorch import DiscreteVAE, OpenAIDiscreteVAE, VQGanVAE, DALLE
from dalle_pytorch.tokenizer import tokenizer, HugTokenizer, YttmTokenizer, ChineseTokenizer
# 参数解析
# Command-line interface of the generation script
parser = argparse.ArgumentParser()

parser.add_argument('--dalle_path', type = str, required = True,
                    help='path to your trained DALL-E')

parser.add_argument('--vqgan_model_path', type=str, default = None,
                    help='path to your trained VQGAN weights. This should be a .ckpt file. (only valid when taming option is enabled)')

parser.add_argument('--vqgan_config_path', type=str, default = None,
                    help='path to your trained VQGAN config. This should be a .yaml file. (only valid when taming option is enabled)')

parser.add_argument('--text', type = str, required = True,
                    help='your text prompt')

parser.add_argument('--num_images', type = int, default = 128, required = False,
                    help='number of images')

parser.add_argument('--batch_size', type = int, default = 4, required = False,
                    help='batch size')

parser.add_argument('--top_k', type = float, default = 0.9, required = False,
                    help='top k filter threshold')

parser.add_argument('--outputs_dir', type = str, default = './outputs', required = False,
                    help='output directory')

parser.add_argument('--bpe_path', type = str,
                    help='path to your huggingface BPE json file')

# Boolean switches
parser.add_argument('--hug', dest='hug', action = 'store_true')

parser.add_argument('--chinese', dest='chinese', action = 'store_true')

parser.add_argument('--taming', dest='taming', action='store_true')

parser.add_argument('--gentxt', dest='gentxt', action='store_true')

# Parse sys.argv
args = parser.parse_args()
# 辅助函数
def exists(val):
    """Return True when ``val`` is not ``None``."""
    return val is not None
# 根据参数设置 tokenizer
# Pick a tokenizer from the CLI flags; otherwise the imported default stays in use
if exists(args.bpe_path):
    klass = HugTokenizer if args.hug else YttmTokenizer
    tokenizer = klass(args.bpe_path)
elif args.chinese:
    tokenizer = ChineseTokenizer()
# 加载 DALL-E 模型
# Load the trained DALL-E checkpoint
dalle_path = Path(args.dalle_path)

assert dalle_path.exists(), 'trained DALL-E must exist'

# NOTE(review): torch.load unpickles the file - only use with trusted checkpoints.
load_obj = torch.load(str(dalle_path))
dalle_params = load_obj.pop('hparams')
vae_params = load_obj.pop('vae_params')
weights = load_obj.pop('weights')
vae_class_name = load_obj.pop('vae_class_name', None)
version = load_obj.pop('version', None)

# Report which library version produced the checkpoint
if exists(version):
    print(f'Loading a model trained with DALLE-pytorch version {version}')
else:
    print('You are loading a model trained on an older version of DALL-E pytorch – it may not be compatible with the most recent version')

# Build the VAE: VQGAN when --taming is set, else the saved custom VAE, else OpenAI's
if args.taming:
    vae = VQGanVAE(args.vqgan_model_path, args.vqgan_config_path)
elif vae_params is not None:
    vae = DiscreteVAE(**vae_params)
else:
    vae = OpenAIDiscreteVAE()

# The VAE class must match the one recorded in the checkpoint, if any
assert not (exists(vae_class_name) and vae.__class__.__name__ != vae_class_name), f'you trained DALL-E using {vae_class_name} but are trying to generate with {vae.__class__.__name__} – please make sure you are passing in the correct paths and settings for the VAE to use for generation'

# Rebuild DALL-E on the GPU and load its weights
dalle = DALLE(vae = vae, **dalle_params).cuda()

dalle.load_state_dict(weights)
# 生成图片
# Generate images for every prompt; prompts are separated by '|'
image_size = vae.image_size

texts = args.text.split('|')

for text in tqdm(texts):
    if args.gentxt:
        # Let DALL-E complete the prompt and use the completed text
        text_tokens, gen_texts = dalle.generate_texts(tokenizer, text=text, filter_thres = args.top_k)
        text = gen_texts[0]
    else:
        text_tokens = tokenizer.tokenize([text], dalle.text_seq_len).cuda()

    # One copy of the prompt tokens per requested image
    text_tokens = repeat(text_tokens, '() n -> b n', b = args.num_images)

    outputs = []

    # Generate in chunks of args.batch_size
    for text_chunk in tqdm(text_tokens.split(args.batch_size), desc = f'generating images for – {text}'):
        output = dalle.generate_images(text_chunk, filter_thres = args.top_k)
        outputs.append(output)

    outputs = torch.cat(outputs)

    # Output directory: prompt with spaces replaced by underscores, first 100 characters
    file_name = text
    outputs_dir = Path(args.outputs_dir) / file_name.replace(' ', '_')[:(100)]
    outputs_dir.mkdir(parents = True, exist_ok = True)

    for i, image in tqdm(enumerate(outputs), desc = 'saving images'):
        save_image(image, outputs_dir / f'{i}.png', normalize=True)

    # The caption is the same for every image, so it is written once
    with open(outputs_dir / 'caption.txt', 'w') as f:
        f.write(file_name)

    print(f'created {args.num_images} images at "{str(outputs_dir)}"')

DALL-E in Pytorch

Released DALLE Models Web-Hostable DALLE Checkpoints

Yannic Kilcher’s video

Implementation / replication of DALL-E (paper), OpenAI's Text to Image Transformer, in Pytorch. It will also contain CLIP for ranking the generations.

Quick Start

Deep Daze or Big Sleep are great alternatives!

For generating video and audio, please see NÜWA

Appreciation

This library could not have been possible without the contributions of janEbert, Clay, robvanvolt, Romain Beaumont, and Alexander! 🙏

Status

Hannu has managed to train a small 6 layer DALL-E on a dataset of just 2000 landscape images! (2048 visual tokens)

Kobiso, a research engineer from Naver, has trained on the CUB200 dataset here, using full and deepspeed sparse attention

(3/15/21) afiaka87 has managed one epoch using a reversible DALL-E and the dVaE here
TheodoreGalanos has trained on 150k layouts with the following results

–
Rom1504 has trained on 50k fashion images with captions with a really small DALL-E (2 layers) for just 24 hours with the following results

afiaka87 trained for 6 epochs on the same dataset as before thanks to the efficient 16k VQGAN with the following results

Thanks to the amazing “mega b#6696” you can generate from this checkpoint in colab –

(5/2/21) First 1.3B DALL-E from 🇷🇺 has been trained and released to the public! 🎉
(4/8/22) Moving onwards to DALLE-2!

Install

$ pip install dalle-pytorch

Usage

Train VAE

# Example: one training step of a DiscreteVAE on a batch of random images.
import torch
from dalle_pytorch import DiscreteVAE
vae = DiscreteVAE(
image_size = 256,
num_layers = 3, # number of downsamples - ex. 256 / (2 ** 3) = (32 x 32 feature map)
num_tokens = 8192, # number of visual tokens. in the paper, they used 8192, but could be smaller for downsized projects
codebook_dim = 512, # codebook dimension
hidden_dim = 64, # hidden dimension
num_resnet_blocks = 1, # number of resnet blocks
temperature = 0.9, # gumbel softmax temperature, the lower this is, the harder the discretization
straight_through = False, # straight-through for gumbel softmax. unclear if it is better one way or the other
)
# random tensor standing in for a batch of 4 RGB 256x256 images
images = torch.randn(4, 3, 256, 256)
loss = vae(images, return_loss = True)
loss.backward()
# train with a lot of data to learn a good codebook

Train DALL-E with pretrained VAE from above

# Example: one training step of DALL-E on top of a DiscreteVAE, then generation.
import torch
from dalle_pytorch import DiscreteVAE, DALLE
vae = DiscreteVAE(
image_size = 256,
num_layers = 3,
num_tokens = 8192,
codebook_dim = 1024,
hidden_dim = 64,
num_resnet_blocks = 1,
temperature = 0.9
)
dalle = DALLE(
dim = 1024,
vae = vae, # automatically infer (1) image sequence length and (2) number of image tokens
num_text_tokens = 10000, # vocab size for text
text_seq_len = 256, # text sequence length
depth = 12, # should aim to be 64
heads = 16, # attention heads
dim_head = 64, # attention head dimension
attn_dropout = 0.1, # attention dropout
ff_dropout = 0.1 # feedforward dropout
)
# random token ids and random image tensors stand in for a real batch
text = torch.randint(0, 10000, (4, 256))
images = torch.randn(4, 3, 256, 256)
loss = dalle(text, images, return_loss = True)
loss.backward()
# do the above for a long time with a lot of data ... then
images = dalle.generate_images(text)
images.shape # (4, 3, 256, 256)

To prime with a starting crop of an image, simply pass two more arguments

# Example: generation primed with a starting image (`dalle` and `text` come from the previous example).
img_prime = torch.randn(4, 3, 256, 256)
images = dalle.generate_images(
text,
img = img_prime,
num_init_img_tokens = (14 * 32) # you can set the size of the initial crop, defaults to a little less than ~1/2 of the tokens, as done in the paper
)
images.shape # (4, 3, 256, 256)

You may also want to generate text using DALL-E. For that call this function:

# Example: text generation; the call returns two values, unpacked here as tokens and texts.
text_tokens, texts = dalle.generate_texts(tokenizer, text)

OpenAI’s Pretrained VAE

You can also skip the training of the VAE altogether, using the pretrained model released by OpenAI! The wrapper class should take care of downloading and caching the model for you auto-magically.

# Example: one DALL-E training step using the pretrained OpenAI VAE.
import torch
from dalle_pytorch import OpenAIDiscreteVAE, DALLE
vae = OpenAIDiscreteVAE() # loads pretrained OpenAI VAE
dalle = DALLE(
dim = 1024,
vae = vae, # automatically infer (1) image sequence length and (2) number of image tokens
num_text_tokens = 10000, # vocab size for text
text_seq_len = 256, # text sequence length
depth = 1, # should aim to be 64
heads = 16, # attention heads
dim_head = 64, # attention head dimension
attn_dropout = 0.1, # attention dropout
ff_dropout = 0.1 # feedforward dropout
)
# random token ids and random image tensors stand in for a real batch
text = torch.randint(0, 10000, (4, 256))
images = torch.randn(4, 3, 256, 256)
loss = dalle(text, images, return_loss = True)
loss.backward()

Taming Transformer’s Pretrained VQGAN VAE

You can also use the pretrained VAE offered by the authors of Taming Transformers! Currently only the VAE with a codebook size of 1024 is offered, with the hope that it may train a little faster than OpenAI’s, which has a size of 8192.

In contrast to OpenAI’s VAE, it also has an extra layer of downsampling, so the image sequence length is 256 instead of 1024 (this will lead to a 16x reduction in training costs, when you do the math). Whether it will generalize as well as the original DALL-E is up to the citizen scientists out there to discover.

Update – it works!

# Example: construct the VQGAN VAE with no arguments (no checkpoint or config path passed).
from dalle_pytorch import VQGanVAE
vae = VQGanVAE()
# the rest is the same as the above example

The default VQGan is the one with codebook size 1024, trained on ImageNet. If you wish to use a different one, you can use vqgan_model_path and vqgan_config_path to pass the .ckpt file and the .yaml file. These options can be used either in the train-dalle script or as arguments of the VQGanVAE class. Other pretrained VQGANs can be found in the taming transformers readme. If you want to train a custom one, you can follow this guide

Adjust text conditioning strength

Recently there has surfaced a new technique for guiding diffusion models without a classifier. The gist of the technique involves randomly dropping out the text condition during training, and at inference time, deriving the rough direction from unconditional to conditional distributions.

Katherine Crowson outlined in a tweet how this could work for autoregressive attention models. I have decided to include her idea in this repository for further exploration. One only has to account for two extra keyword arguments on training (null_cond_prob) and generation (cond_scale).

# Example: training with null_cond_prob and generating with cond_scale.
import torch
from dalle_pytorch import DiscreteVAE, DALLE
vae = DiscreteVAE(
image_size = 256,
num_layers = 3,
num_tokens = 8192,
codebook_dim = 1024,
hidden_dim = 64,
num_resnet_blocks = 1,
temperature = 0.9
)
dalle = DALLE(
dim = 1024,
vae = vae,
num_text_tokens = 10000,
text_seq_len = 256,
depth = 12,
heads = 16,
dim_head = 64,
attn_dropout = 0.1,
ff_dropout = 0.1
)
# random token ids and random image tensors stand in for a real batch
text = torch.randint(0, 10000, (4, 256))
images = torch.randn(4, 3, 256, 256)
loss = dalle(
text,
images,
return_loss = True,
null_cond_prob = 0.2 # firstly, set this to the probability of dropping out the condition, 20% is recommended as a default
)
loss.backward()
# do the above for a long time with a lot of data ... then
images = dalle.generate_images(
text,
cond_scale = 3. # secondly, set this to a value greater than 1 to increase the conditioning beyond average
)
images.shape # (4, 3, 256, 256)

That’s it!

Ranking the generations

Train CLIP

# Example: one training step of CLIP on random text tokens and images.
import torch
from dalle_pytorch import CLIP
clip = CLIP(
dim_text = 512,
dim_image = 512,
dim_latent = 512,
num_text_tokens = 10000,
text_enc_depth = 6,
text_seq_len = 256,
text_heads = 8,
num_visual_tokens = 512,
visual_enc_depth = 6,
visual_image_size = 256,
visual_patch_size = 32,
visual_heads = 8
)
text = torch.randint(0, 10000, (4, 256))
images = torch.randn(4, 3, 256, 256)
# all-True boolean mask with the same shape as `text`
mask = torch.ones_like(text).bool()
loss = clip(text, images, text_mask = mask, return_loss = True)
loss.backward()

To get the similarity scores from your trained Clipper, just do

images, scores = dalle.generate_images(text, mask = mask, clip = clip)
scores.shape # (2,)
images.shape # (2, 3, 256, 256)
# do your topk here, in paper they sampled 512 and chose top 32

Or you can just use the official CLIP model to rank the images from DALL-E

Scaling depth

In the blog post, they used 64 layers to achieve their results. I added reversible networks, from the Reformer paper, in order for users to attempt to scale depth at the cost of compute. Reversible networks allow you to scale to any depth at no memory cost, but a little over 2x compute cost (each layer is rerun on the backward pass).

Simply set the reversible keyword to True for the DALLE class

dalle = DALLE(
dim = 1024,
vae = vae,
num_text_tokens = 10000,
text_seq_len = 256,
depth = 64,
heads = 16,
reversible = True # <– reversible networks https://arxiv.org/abs/2001.04451
)

Sparse Attention

The blogpost alluded to a mixture of different types of sparse attention, used mainly on the image (while the text presumably had full causal attention). I have done my best to replicate these types of sparse attention, on the scant details released. Primarily, it seems as though they are doing causal axial row / column attention, combined with a causal convolution-like attention.

By default DALLE will use full attention for all layers, but you can specify the attention type per layer as follows.

full full attention
axial_row axial attention, along the rows of the image feature map
axial_col axial attention, along the columns of the image feature map
conv_like convolution-like attention, for the image feature map

The sparse attention only applies to the image. Text will always receive full attention, as said in the blogpost.

dalle = DALLE(
dim = 1024,
vae = vae,
num_text_tokens = 10000,
text_seq_len = 256,
depth = 64,
heads = 16,
reversible = True,
attn_types = ('full', 'axial_row', 'axial_col', 'conv_like') # cycles between these four types of attention
)

Deepspeed Sparse Attention

You can also train with Microsoft Deepspeed’s Sparse Attention, with any combination of dense and sparse attention that you’d like. However, you will have to endure the installation process.

First, you need to install Deepspeed with Sparse Attention

$ sh install_deepspeed.sh

Next, you need to install the pip package triton. It will need to be a version < 1.0 because that’s what Microsoft used.

$ pip install triton==0.4.2

If both of the above succeeded, now you can train with Sparse Attention!

dalle = DALLE(
dim = 512,
vae = vae,
num_text_tokens = 10000,
text_seq_len = 256,
depth = 64,
heads = 8,
attn_types = ('full', 'sparse') # interleave sparse and dense attention for 64 layers
)

Training

This section will outline how to train the discrete variational autoencoder as well as the final multi-modal transformer (DALL-E). We are going to use Weights & Biases for all the experiment tracking.

(You can also do everything in this section in a Google Colab, link below)

Train in Colab

$ pip install wandb

Followed by

$ wandb login

VAE

To train the VAE, you just need to run

$ python train_vae.py --image_folder /path/to/your/images

If you installed everything correctly, a link to the experiments page should show up in your terminal. You can follow your link there and customize your experiment, like the example layout below.

You can of course open up the training script at ./train_vae.py, where you can modify the constants, what is passed to Weights & Biases, or any other tricks you know to make the VAE learn better.

Model will be saved periodically to ./vae.pt

In the experiment tracker, you will have to monitor the hard reconstruction, as we are essentially teaching the network to compress images into discrete visual tokens for use in the transformer as a visual vocabulary.

Weights and Biases will allow you to monitor the temperature annealing, image reconstructions (encoder and decoder working properly), as well as to watch out for codebook collapse (where the network decides to only use a few tokens out of what you provide it).

Once you have trained a decent VAE to your satisfaction, you can move on to the next step with your model weights at ./vae.pt.

DALL-E Training

Training using an Image-Text-Folder

Now you just have to invoke the ./train_dalle.py script, indicating which VAE model you would like to use, as well as the path to your folder of images and text.

The dataset I am currently working with contains a folder of images and text files, arbitrarily nested in subfolders, where each text file name corresponds with an image name, and where each text file contains multiple descriptions, delimited by newlines. The script will find and pair all the image and text files with the same names, and randomly select one of the textual descriptions during batch creation.

ex.

```
📂image-and-text-data
 ┣ 📜cat.png
 ┣ 📜cat.txt
 ┣ 📜dog.jpg
 ┣ 📜dog.txt
 ┣ 📜turtle.jpeg
 ┗ 📜turtle.txt
```

ex. `cat.txt`

```
A black and white cat curled up next to the fireplace
A fireplace, with a cat sleeping next to it
A black cat with a red collar napping
```

If you have a dataset with its own directory structure for tying together image and text descriptions, do let me know in the issues, and I'll see if I can accommodate it in the script.

```bash
$ python train_dalle.py --vae_path ./vae.pt --image_text_folder /path/to/data
```

You likely will not finish DALL-E training as quickly as you did your Discrete VAE. To resume from where you left off, just run the same script, but with the path to your DALL-E checkpoints.

```bash
$ python train_dalle.py --dalle_path ./dalle.pt --image_text_folder /path/to/data
```
## Training using WebDataset

WebDataset files are regular .tar(.gz) files which can be streamed and used for DALLE-pytorch training.
You just need to provide the image (first comma separated argument) and caption (second comma separated argument)
column key after the `--wds` argument. The `--image_text_folder` points to your .tar(.gz) file instead of the datafolder.

```bash
$ python train_dalle.py --wds img,cap --image_text_folder /path/to/data.tar(.gz)
```

Distributed training with deepspeed works the same way, e.g.:

```bash
$ deepspeed train_dalle.py --wds img,cap --image_text_folder /path/to/data.tar(.gz) --fp16 --deepspeed
```

If you have a folder containing shards (a dataset split into several .tar(.gz) files), this is also supported:

```bash
$ deepspeed train_dalle.py --wds img,cap --image_text_folder /path/to/shardfolder --fp16 --deepspeed
```

You can stream the data from an http server or Google Cloud Storage like this:

```bash
$ deepspeed train_dalle.py --image_text_folder "http://storage.googleapis.com/nvdata-openimages/openimages-train-{000000..000554}.tar" --wds jpg,json --taming --truncate_captions --random_resize_crop_lower_ratio=0.8 --attn_types=full --epochs=2 --fp16 --deepspeed
```

In order to convert your image-text-folder to WebDataset format, you can make use of one of several methods
(four examples are given at https://www.youtube.com/watch?v=v_PacO-3OGQ), or use a little helper script which also supports splitting your dataset
into shards of .tar.gz files (https://github.com/robvanvolt/DALLE-datasets/blob/main/wds_create_shards.py).
### DALL-E with OpenAI's VAE

You can now also train DALL-E without having to train the Discrete VAE at all, courtesy of OpenAI open-sourcing their model. You simply have to invoke the `train_dalle.py` script without specifying the `--vae_path`

```bash
$ python train_dalle.py --image_text_folder /path/to/coco/dataset
```

### DALL-E with Taming Transformer's VQVAE

Just use the `--taming` flag. Highly recommended you use this VAE over the OpenAI one!

```bash
$ python train_dalle.py --image_text_folder /path/to/coco/dataset --taming
```
### Generation

Once you have successfully trained DALL-E, you can then use the saved model for generation!

```bash
$ python generate.py --dalle_path ./dalle.pt --text 'fireflies in a field under a full moon'
```

You should see your images saved as `./outputs/{your prompt}/{image number}.jpg`

To generate multiple images, just pass in your text with the '|' character as a separator.

ex.

```bash
$ python generate.py --dalle_path ./dalle.pt --text 'a dog chewing a bone|a cat chasing mice|a frog eating a fly'
```

Note that DALL-E is a full image+text language model. As a consequence you can also generate text using a dalle model.

```bash
$ python generate.py --dalle_path ./dalle.pt --text 'a dog chewing a bone' --gentext
```

This will complete the provided text, save it in a caption.txt and generate the corresponding images.
### Docker

You can use a docker container to make sure the version of Pytorch and Cuda are correct for training DALL-E. <a href="https://docs.docker.com/get-docker/">Docker</a> and <a href='#'>Docker Container Runtime</a> should be installed.

To build:

```bash
docker build -t dalle docker
```

To run in an interactive shell:

```bash
docker run --gpus all -it --mount src="$(pwd)",target=/workspace/dalle,type=bind dalle:latest bash
```
### Distributed Training

#### DeepSpeed

Thanks to <a href="https://github.com/janEbert">janEbert</a>, the repository is now equipped so you can train DALL-E with Microsoft's <a href="https://www.deepspeed.ai/">Deepspeed</a>!

You can simply replace any `$ python <file>.py [args...]` command with

```bash
$ deepspeed <file>.py [args...] --deepspeed
```

to use the aforementioned DeepSpeed library for distributed training, speeding up your experiments.

Modify the `deepspeed_config` dictionary in `train_dalle.py` or
`train_vae.py` according to the DeepSpeed settings you'd like to use
for each one. See the [DeepSpeed configuration
docs](https://www.deepspeed.ai/docs/config-json/) for more
information.
#### DeepSpeed - 32 and 16 bit Precision

As of DeepSpeed version 0.3.16, ZeRO optimizations can be used with
single-precision floating point numbers. If you are using an older
version, you'll have to pass the `--fp16` flag to be able to enable
ZeRO optimizations.

#### DeepSpeed - Apex Automatic Mixed Precision

Automatic mixed precision is a stable alternative to fp16 which still provides a decent speedup.
In order to run with Apex AMP (through DeepSpeed), you will need to install DeepSpeed using either the Dockerfile or the bash script.

Then you will need to install apex from source.
This may take a while and you may see some compilation warnings which can be ignored.

```bash
sh install_apex.sh
```

Now, run `train_dalle.py` with `deepspeed` instead of `python` as done here:

```bash
deepspeed train_dalle.py \
    --taming \
    --image_text_folder 'DatasetsDir' \
    --distr_backend 'deepspeed' \
    --amp
```
#### Horovod

[Horovod](https://horovod.ai) offers a stable way for data parallel
training.

After [installing
Horovod](https://github.com/lucidrains/DALLE-pytorch/wiki/Horovod-Installation),
replace any `$ python <file>.py [args...]` command with

```bash
$ horovodrun -np <num-gpus> <file>.py [args...] --distributed_backend horovod
```

to use the Horovod library for distributed training, speeding up your
experiments. This will multiply your effective batch size per training
step by `<num-gpus>`, so you may need to rescale the learning rate
accordingly.
#### Custom Tokenizer

This repository supports custom tokenization with <a href="https://github.com/VKCOM/YouTokenToMe">YouTokenToMe</a>, if you wish to use it instead of the default simple tokenizer. Simply pass in an extra `--bpe_path` when invoking `train_dalle.py` and `generate.py`, with the path to your BPE model file.

The only requirement is that you use `0` as the padding during tokenization

ex.

```bash
$ python train_dalle.py --image_text_folder ./path/to/data --bpe_path ./path/to/bpe.model
```

To create a BPE model file from scratch, firstly

```bash
$ pip install youtokentome
```

Then you need to prepare a big text file that is a representative sample of the type of text you want to encode. You can then invoke the `youtokentome` command-line tools. You'll also need to specify the vocab size you wish to use, in addition to the corpus of text.

```bash
$ yttm bpe --vocab_size 8000 --data ./path/to/big/text/file.txt --model ./path/to/bpe.model
```

That’s it! The BPE model file is now saved to ./path/to/bpe.model and you can begin training!

Chinese

You can train with a pretrained chinese tokenizer offered by Huggingface 🤗 by simply passing in an extra flag –chinese

ex.

$ python train_dalle.py –chinese –image_text_folder ./path/to/data

$ python generate.py –chinese –text \’追老鼠的猫\’

Citations

@misc{ramesh2021zeroshot,
title = {Zero-Shot Text-to-Image Generation},
author = {Aditya Ramesh and Mikhail Pavlov and Gabriel Goh and Scott Gray and Chelsea Voss and Alec Radford and Mark Chen and Ilya Sutskever},
year = {2021},
eprint = {2102.12092},
archivePrefix = {arXiv},
primaryClass = {cs.CV}
}

@misc{unpublished2021clip,
title = {CLIP: Connecting Text and Images},
author = {Alec Radford and Ilya Sutskever and Jong Wook Kim and Gretchen Krueger and Sandhini Agarwal},
year = {2021}
}

@misc{kitaev2020reformer,
title = {Reformer: The Efficient Transformer},
author = {Nikita Kitaev and Łukasz Kaiser and Anselm Levskaya},
year = {2020},
eprint = {2001.04451},
archivePrefix = {arXiv},
primaryClass = {cs.LG}
}

@misc{esser2021taming,
title = {Taming Transformers for High-Resolution Image Synthesis},
author = {Patrick Esser and Robin Rombach and Björn Ommer},
year = {2021},
eprint = {2012.09841},
archivePrefix = {arXiv},
primaryClass = {cs.CV}
}

@misc{ding2021cogview,
title = {CogView: Mastering Text-to-Image Generation via Transformers},
author = {Ming Ding and Zhuoyi Yang and Wenyi Hong and Wendi Zheng and Chang Zhou and Da Yin and Junyang Lin and Xu Zou and Zhou Shao and Hongxia Yang and Jie Tang},
year = {2021},
eprint = {2105.13290},
archivePrefix = {arXiv},
primaryClass = {cs.CV}
}

@software{peng_bo_2021_5196578,
author = {PENG Bo},
title = {BlinkDL/RWKV-LM: 0.01},
month = {aug},
year = {2021},
publisher = {Zenodo},
version = {0.01},
doi = {10.5281/zenodo.5196578},
url = {https://doi.org/10.5281/zenodo.5196578}
}

@misc{su2021roformer,
title = {RoFormer: Enhanced Transformer with Rotary Position Embedding},
author = {Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu},
year = {2021},
eprint = {2104.09864},
archivePrefix = {arXiv},
primaryClass = {cs.CL}
}

@inproceedings{ho2021classifierfree,
title = {Classifier-Free Diffusion Guidance},
author = {Jonathan Ho and Tim Salimans},
booktitle = {NeurIPS 2021 Workshop on Deep Generative Models and Downstream Applications},
year = {2021},
url = {https://openreview.net/forum?id=qw8AKxfYbI}
}

@misc{crowson2022,
author = {Katherine Crowson},
url = {https://twitter.com/RiversHaveWings/status/1478093658716966912}
}

@article{Liu2023BridgingDA,
title = {Bridging Discrete and Backpropagation: Straight-Through and Beyond},
author = {Liyuan Liu and Chengyu Dong and Xiaodong Liu and Bin Yu and Jianfeng Gao},
journal = {ArXiv},
year = {2023},
volume = {abs/2304.08612}
}

Those who do not want to imitate anything, produce nothing. – Dali

.\\lucidrains\\DALLE-pytorch\\setup.py

# Packaging script: declares the metadata and dependencies of dalle-pytorch.
from setuptools import setup, find_packages

# Run version.py in this namespace so that `__version__` is defined for the
# setup() call below. The context manager closes the file handle promptly.
with open('dalle_pytorch/version.py') as version_file:
    exec(version_file.read())

setup(
    # distribution name
    name = 'dalle-pytorch',
    # discover every package in the source tree
    packages = find_packages(),
    # ship the data files declared for the packages
    include_package_data = True,
    # version string defined by version.py
    version = __version__,
    # license
    license='MIT',
    # short description
    description = 'DALL-E - Pytorch',
    # author
    author = 'Phil Wang',
    # author e-mail
    author_email = 'lucidrains@gmail.com',
    # content type of the long description
    long_description_content_type = 'text/markdown',
    # project homepage
    url = 'https://github.com/lucidrains/dalle-pytorch',
    # search keywords
    keywords = [
        'artificial intelligence',
        'attention mechanism',
        'transformers',
        'text-to-image'
    ],
    # runtime dependencies
    install_requires=[
        'axial_positional_embedding',
        'DALL-E',
        'einops>=0.3.2',
        'ftfy',
        'packaging',
        'pillow',
        'regex',
        'rotary-embedding-torch',
        'taming-transformers-rom1504',
        'tokenizers',
        'torch>=1.6',
        'torchvision',
        'transformers',
        'tqdm',
        'youtokentome',
        'WebDataset'
    ],
    # trove classifiers (these must match the registered names exactly, hence
    # the plain ASCII hyphen in "4 - Beta")
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Topic :: Scientific/Engineering :: Artificial Intelligence',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3.6',
    ],
)

.\\lucidrains\\DALLE-pytorch\\train_dalle.py

# 导入必要的库
import argparse
from pathlib import Path
import time
from glob import glob
import os
import shutil
import torch
import wandb # 如果用户没有安装 wandb，则提前退出
from torch.nn.utils import clip_grad_norm_
from torch.optim import Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
# 导入 DALL-E 相关模块
from dalle_pytorch import __version__
from dalle_pytorch import OpenAIDiscreteVAE, VQGanVAE, DiscreteVAE, DALLE
from dalle_pytorch import distributed_utils
from dalle_pytorch.loader import TextImageDataset
from dalle_pytorch.tokenizer import tokenizer, HugTokenizer, ChineseTokenizer, YttmTokenizer
# 导入用于支持 webdataset 的库
import webdataset as wds
from torchvision import transforms as T
from PIL import Image
from io import BytesIO
# Command-line argument parsing.
# Every option string starts with a plain ASCII "--": argparse only treats a
# name as an optional flag when it begins with a prefix character ("-").
parser = argparse.ArgumentParser()

# --vae_path and --dalle_path are mutually exclusive; neither is required.
group = parser.add_mutually_exclusive_group(required=False)

# path to a trained discrete VAE
group.add_argument('--vae_path', type=str,
                   help='path to your trained discrete VAE')

# path to a partially trained DALL-E (used to resume training)
group.add_argument('--dalle_path', type=str,
                   help='path to your partially trained DALL-E')

# path to trained VQGAN weights
parser.add_argument('--vqgan_model_path', type=str, default=None,
                    help='path to your trained VQGAN weights. This should be a .ckpt file. (only valid when taming option is enabled)')

# path to the matching VQGAN config
parser.add_argument('--vqgan_config_path', type=str, default=None,
                    help='path to your trained VQGAN config. This should be a .yaml file. (only valid when taming option is enabled)')

# location of the image/text training data (the only required option)
parser.add_argument('--image_text_folder', type=str, required=True,
                    help='path to your folder of images and text for learning the DALL-E')

# WebDataset column names for image and text; the empty default leaves
# WebDataset mode off
parser.add_argument('--wds', type=str, default='',
                    help='Comma separated list of WebDataset (1) image and (2) text column names. Must contain 2 values, e.g. img,cap.')

# truncate captions that exceed the maximum token length
parser.add_argument('--truncate_captions', dest='truncate_captions', action='store_true',
                    help='Captions passed in which exceed the max token length will be truncated if this is set.')

# lower bound of the random-resized-crop scale (stored as args.resize_ratio)
parser.add_argument('--random_resize_crop_lower_ratio', dest='resize_ratio', type=float, default=0.75,
                    help='Random resized crop lower ratio')

# use the Chinese tokenizer
parser.add_argument('--chinese', dest='chinese', action='store_true')

# use the taming-transformers VQGAN VAE
parser.add_argument('--taming', dest='taming', action='store_true')

# load --bpe_path with the Hugging Face tokenizer instead of YouTokenToMe
parser.add_argument('--hug', dest='hug', action='store_true')

# path to a BPE model file
parser.add_argument('--bpe_path', type=str,
                    help='path to your BPE json file')

# base name (without extension) of the saved DALL-E checkpoint
parser.add_argument('--dalle_output_file_name', type=str, default="dalle",
                    help='output_file_name')

# DeepSpeed 16 bit precision
parser.add_argument('--fp16', action='store_true',
                    help='(experimental) - Enable DeepSpeed 16 bit precision. Reduces VRAM.')

# Apex "O1" automatic mixed precision
parser.add_argument('--amp', action='store_true',
                    help='Apex "O1" automatic mixed precision. More stable than 16 bit precision. Can\'t be used in conjunction with deepspeed zero stages 1-3.')

# run name used by Weights & Biases
parser.add_argument('--wandb_name', default='dalle_train_transformer',
                    help='Name W&B will use when saving results.\ne.g. `--wandb_name "coco2017-full-sparse"`')

# Weights & Biases team/entity to log to
parser.add_argument('--wandb_entity', default=None,
                    help='(optional) Name of W&B team/entity to log to.')

# numerically stable softmax
parser.add_argument('--stable_softmax', dest='stable_softmax', action='store_true',
                    help='Prevent values from becoming too large during softmax. Helps with stability in fp16 and Mixture of Quantization training.')

# let the distributed utilities register their own options on the parser
parser = distributed_utils.wrap_arg_parser(parser)
# Training settings.
# Option strings use a plain ASCII "--" so argparse registers them as flags.
train_group = parser.add_argument_group('Training settings')

# print a flops/runtime analysis, then exit
train_group.add_argument('--flops_profiler', dest='flops_profiler', action='store_true', help='Exits after printing detailed flops/runtime analysis of forward/backward')

# number of training epochs
train_group.add_argument('--epochs', default=20, type=int, help='Number of epochs')

# checkpoint interval, in steps
train_group.add_argument('--save_every_n_steps', default=1000, type=int, help='Save a checkpoint every n steps')

# cap on retained DeepSpeed checkpoints (older ones are deleted)
train_group.add_argument('--keep_n_checkpoints', default=None, type=int, help='(Careful) Deletes old deepspeed checkpoints if there are more than n')

# batch size
train_group.add_argument('--batch_size', default=4, type=int, help='Batch size')

# gradient accumulation steps (DeepSpeed only)
train_group.add_argument('--ga_steps', default=1, type=int, help='Number of steps to accumulate gradients across per each iteration. DeepSpeed only.')

# learning rate
train_group.add_argument('--learning_rate', default=3e-4, type=float, help='Learning rate')

# gradient-norm clipping threshold
train_group.add_argument('--clip_grad_norm', default=0.5, type=float, help='Clip gradient norm')

# enable learning-rate decay
train_group.add_argument('--lr_decay', dest='lr_decay', action='store_true')
# Model settings.
# Option strings use a plain ASCII "--" so argparse registers them as flags.
model_group = parser.add_argument_group('Model settings')

# model dimension
model_group.add_argument('--dim', default=512, type=int, help='Model dimension')

# text sequence length
model_group.add_argument('--text_seq_len', default=256, type=int, help='Text sequence length')

# number of layers
model_group.add_argument('--depth', default=2, type=int, help='Model depth')

# number of attention heads
model_group.add_argument('--heads', default=8, type=int, help='Model number of heads')

# dimension of each attention head
model_group.add_argument('--dim_head', default=64, type=int, help='Model head dimension')

# The two dropout rates are registered on the training group, not the model group.
train_group.add_argument('--ff_dropout', default=0.0, type=float, help='Feed forward dropout.')

# The help text previously repeated 'Feed forward dropout.' for this option.
train_group.add_argument('--attn_dropout', default=0.0, type=float, help='Attention dropout.')

# use reversible layers
model_group.add_argument('--reversible', dest='reversible', action='store_true')

# weight of the image loss
model_group.add_argument('--loss_img_weight', default=7, type=int, help='Image loss weight')

# attention types, comma separated
model_group.add_argument('--attn_types', default='full', type=str, help='comma separated list of attention types. attention type can be: full or sparse or axial_row or axial_col or conv_like.')

# shift-tokens feature
model_group.add_argument('--shift_tokens', help='Use the shift tokens feature', action='store_true')

# rotary embeddings
model_group.add_argument('--rotary_emb', help='Use rotary embeddings', action='store_true')

# ids of attention layers that share weights
model_group.add_argument('--shared_attn_ids', default=None, type=str, help='Comma separated list of shared attention layer ids. Default: sharing is disabled')

# ids of feed-forward layers that share weights
model_group.add_argument('--shared_ff_ids', default=None, type=str, help='Comma separated list of shared feed forward layer ids. Default: sharing is disabled')

# tie input and output embeddings
model_group.add_argument('--share_input_output_emb', help='Share input and output embeddings', action='store_true')

# parse the command line
args = parser.parse_args()
# Helper functions

def exists(val):
    """Return False only for None; every other value (0, '', [] ...) counts as present."""
    if val is None:
        return False
    return True
def get_trainable_params(model):
    """Return a list of the entries of `model.parameters()` whose `requires_grad` is truthy.

    The order in which `model.parameters()` yields the entries is preserved.
    """
    trainable = []
    for param in model.parameters():
        if param.requires_grad:
            trainable.append(param)
    return trainable
def cp_path_to_dir(cp_path, tag):
    """Map a checkpoint file path to its `tag`-labelled checkpoint directory.

    An existing directory is handed back unchanged (as a `Path`). Otherwise
    the file extension is dropped and `-{tag}-cp` is appended to what remains.
    """
    checkpoint = cp_path if isinstance(cp_path, Path) else Path(cp_path)
    if checkpoint.is_dir():
        return checkpoint
    base = checkpoint.parent / checkpoint.stem
    return Path(f'{base}-{tag}-cp')
# Constants derived from the parsed command-line arguments

# `--wds` holds "<image column>,<text column>". WebDataset mode is enabled only
# when exactly two names were given (the default '' splits into a single item).
WEBDATASET_IMAGE_TEXT_COLUMNS = tuple(args.wds.split(','))
ENABLE_WEBDATASET = len(WEBDATASET_IMAGE_TEXT_COLUMNS) == 2

# file name of the saved DALL-E checkpoint
DALLE_OUTPUT_FILE_NAME = args.dalle_output_file_name + ".pt"

# model / checkpoint paths
VAE_PATH = args.vae_path
VQGAN_MODEL_PATH = args.vqgan_model_path
VQGAN_CONFIG_PATH = args.vqgan_config_path
DALLE_PATH = args.dalle_path
# training resumes whenever --dalle_path was supplied
RESUME = exists(DALLE_PATH)

# training schedule
EPOCHS = args.epochs
BATCH_SIZE = args.batch_size

# optimisation and checkpointing
LEARNING_RATE = args.learning_rate
GRAD_CLIP_NORM = args.clip_grad_norm
LR_DECAY = args.lr_decay
SAVE_EVERY_N_STEPS = args.save_every_n_steps
KEEP_N_CHECKPOINTS = args.keep_n_checkpoints

# model hyperparameters
MODEL_DIM = args.dim
TEXT_SEQ_LEN = args.text_seq_len
DEPTH = args.depth
HEADS = args.heads
DIM_HEAD = args.dim_head
REVERSIBLE = args.reversible
LOSS_IMG_WEIGHT = args.loss_img_weight
FF_DROPOUT = args.ff_dropout
ATTN_DROPOUT = args.attn_dropout
STABLE = args.stable_softmax
SHIFT_TOKENS = args.shift_tokens
ROTARY_EMB = args.rotary_emb

# comma separated option values become tuples of strings; the two shared-id
# options stay None when they were not given
ATTN_TYPES = tuple(args.attn_types.split(','))
SHARED_ATTN_IDS = tuple(args.shared_attn_ids.split(',')) if exists(args.shared_attn_ids) else None
SHARED_FF_IDS = tuple(args.shared_ff_ids.split(',')) if exists(args.shared_ff_ids) else None
SHARE_INPUT_OUTPUT_EMB = args.share_input_output_emb

# name of the auxiliary file stored inside a DeepSpeed checkpoint directory
DEEPSPEED_CP_AUX_FILENAME = 'auxiliary.pt'
# Validate the data location and, in WebDataset mode, resolve it into DATASET.
if not ENABLE_WEBDATASET:
    # plain image/text folder: it only has to exist
    assert Path(args.image_text_folder).exists(), f'The path {args.image_text_folder} was not found.'
else:
    if Path(args.image_text_folder).is_dir():
        # a directory of shards: collect every path below it that contains ".tar"
        DATASET = [str(p) for p in Path(args.image_text_folder).glob("**/*") if ".tar" in str(p).lower()]  # .name
        assert len(DATASET) > 0, 'The directory ({}) does not contain any WebDataset/.tar files.'.format(args.image_text_folder)
        print('Found {} WebDataset .tar(.gz) file(s) under given path {}!'.format(len(DATASET), args.image_text_folder))
    elif ('http://' in args.image_text_folder.lower()) or ('https://' in args.image_text_folder.lower()):
        # http(s) source, streamed through curl
        # NOTE(review): the argument is interpolated unescaped into a shell
        # pipeline; only pass trusted values.
        DATASET = f"pipe:curl -L -s {args.image_text_folder} || true"
        # The old message formatted len(DATASET), i.e. the character count of
        # the command string, and dropped the path; report the path instead.
        print('Found http(s) link under given path {}!'.format(args.image_text_folder))
    elif 'gs://' in args.image_text_folder.lower():
        # Google Cloud Storage source, streamed through gsutil
        # NOTE(review): same unescaped shell interpolation as above.
        DATASET = f"pipe:gsutil cat {args.image_text_folder} || true"
        print('Found GCS link under given path {}!'.format(args.image_text_folder))
    elif '.tar' in args.image_text_folder:
        # a single local archive
        DATASET = args.image_text_folder
        print('Found WebDataset .tar(.gz) file under given path {}!'.format(args.image_text_folder))
    else:
        # neither a folder, an archive nor a URL
        raise Exception('No folder, no .tar(.gz) and no url pointing to tar files provided under {}.'.format(args.image_text_folder))
# Select the distributed backend from the command-line arguments and initialise it
distr_backend = distributed_utils.set_backend_from_args(args)
distr_backend.initialize()
# Whether the DeepSpeed backend is the one in use
using_deepspeed = distributed_utils.using_backend(distributed_utils.DeepSpeedBackend)
# Whether this process is the root worker
is_root = distr_backend.is_root_worker()
# Tokenizer selection. When neither branch applies, the `tokenizer` imported
# from dalle_pytorch.tokenizer stays in place.
if exists(args.bpe_path):
    # a BPE file was given: --hug decides which tokenizer class loads it
    if args.hug:
        klass = HugTokenizer
    else:
        klass = YttmTokenizer
    tokenizer = klass(args.bpe_path)
elif args.chinese:
    # --chinese without a BPE file: use the Chinese tokenizer
    tokenizer = ChineseTokenizer()
# Reconstitute the VAE and the DALL-E hyperparameters, either from a DALL-E
# checkpoint (resume) or from the command-line arguments (fresh run).
if RESUME:
    dalle_path = Path(DALLE_PATH)
    if using_deepspeed:
        # with DeepSpeed the checkpoint is a directory; the hyperparameters
        # live in its auxiliary file
        cp_dir = cp_path_to_dir(dalle_path, 'ds')
        assert cp_dir.is_dir(), f'DeepSpeed checkpoint directory {cp_dir} not found'
        dalle_path = cp_dir / DEEPSPEED_CP_AUX_FILENAME
    else:
        assert dalle_path.exists(), 'DALL-E model file does not exist'

    # NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
    loaded_obj = torch.load(str(dalle_path), map_location='cpu')

    dalle_params, vae_params, weights = loaded_obj['hparams'], loaded_obj['vae_params'], loaded_obj['weights']
    # optimizer / scheduler state may be absent from the checkpoint
    opt_state = loaded_obj.get('opt_state')
    scheduler_state = loaded_obj.get('scheduler_state')

    # rebuild the VAE: stored hyperparameters take precedence, then --taming,
    # then the OpenAI VAE
    if vae_params is not None:
        vae = DiscreteVAE(**vae_params)
    elif args.taming:
        vae = VQGanVAE(VQGAN_MODEL_PATH, VQGAN_CONFIG_PATH)
    else:
        vae = OpenAIDiscreteVAE()

    # epoch to resume from (0 when the checkpoint does not record one)
    resume_epoch = loaded_obj.get('epoch', 0)
else:
    if exists(VAE_PATH):
        # a custom trained discrete VAE was supplied
        vae_path = Path(VAE_PATH)
        assert vae_path.exists(), 'VAE model file does not exist'
        assert not vae_path.is_dir(), (
            'Cannot load VAE model from directory; please use a '
            'standard *.pt checkpoint. '
            'Currently, merging a DeepSpeed-partitioned VAE into a DALLE '
            'model is not supported.'
        )

        # Load onto the CPU, as the resume branch does, so that a checkpoint
        # saved on a GPU can be read on any machine; load_state_dict copies
        # the values into the module's own tensors.
        loaded_obj = torch.load(str(vae_path), map_location='cpu')

        vae_params, weights = loaded_obj['hparams'], loaded_obj['weights']

        vae = DiscreteVAE(**vae_params)
        vae.load_state_dict(weights)
    else:
        # no VAE path given: fall back to a pretrained VAE
        if is_root:
            print('using pretrained VAE for encoding images to tokens')
        vae_params = None

        if args.taming:
            vae = VQGanVAE(VQGAN_MODEL_PATH, VQGAN_CONFIG_PATH)
        else:
            vae = OpenAIDiscreteVAE()

    # hyperparameters of a fresh DALL-E, taken from the command line
    dalle_params = dict(
        num_text_tokens=tokenizer.vocab_size,
        text_seq_len=TEXT_SEQ_LEN,
        dim=MODEL_DIM,
        depth=DEPTH,
        heads=HEADS,
        dim_head=DIM_HEAD,
        reversible=REVERSIBLE,
        loss_img_weight=LOSS_IMG_WEIGHT,
        attn_types=ATTN_TYPES,
        ff_dropout=FF_DROPOUT,
        attn_dropout=ATTN_DROPOUT,
        stable=STABLE,
        shift_tokens=SHIFT_TOKENS,
        rotary_emb=ROTARY_EMB,
        shared_attn_ids=SHARED_ATTN_IDS,
        shared_ff_ids=SHARED_FF_IDS,
        share_input_output_emb=SHARE_INPUT_OUTPUT_EMB,
    )
    # a fresh run starts at epoch 0
    resume_epoch = 0
# Image size and channel count are taken from the VAE
IMAGE_SIZE, CHANNELS = vae.image_size, vae.channels

# Four channels means the images carry an alpha channel
TRANSPARENT = CHANNELS == 4
if CHANNELS == 4:
    IMAGE_MODE = 'RGBA'
else:
    IMAGE_MODE = 'RGB'

# With --fp16 and the OpenAI VAE, switch the encoder's output convolution to float16
if isinstance(vae, OpenAIDiscreteVAE) and args.fp16:
    vae.enc.blocks.output.conv.use_float16 = True
# Helper functions

def group_weight(model):
    """Split the model's parameters into a weight-decay and a no-weight-decay group.

    A parameter goes into the no-decay group when its name contains
    'transformer' and also contains 'bias' or 'norm'; every other parameter
    is decayed. Returns two optimizer parameter-group dicts, the second one
    carrying `weight_decay=0.0`.
    """
    decayed, undecayed = [], []
    for name, param in model.named_parameters():
        skip_decay = 'transformer' in name and ('bias' in name or 'norm' in name)
        (undecayed if skip_decay else decayed).append(param)

    # every parameter must have landed in exactly one of the two groups
    assert len(list(model.parameters())) == len(decayed) + len(undecayed)
    return [dict(params=decayed), dict(params=undecayed, weight_decay=.0)]
# Dataset and dataloader construction

# Shuffling is enabled unless the Horovod backend is in use
is_shuffle = not distributed_utils.using_backend(distributed_utils.HorovodBackend)

# Image preprocessing: convert to IMAGE_MODE when the mode differs, take a
# random square crop resized to IMAGE_SIZE, then convert to a tensor
imagepreproc = T.Compose([
    T.Lambda(lambda img: img if img.mode == IMAGE_MODE else img.convert(IMAGE_MODE)),
    T.RandomResizedCrop(
        IMAGE_SIZE,
        scale=(args.resize_ratio, 1.),
        ratio=(1., 1.),
    ),
    T.ToTensor(),
])
def imagetransform(b):
    """Open the raw bytes `b` as a PIL image through an in-memory buffer."""
    buffer = BytesIO(b)
    return Image.open(buffer)
def tokenize(s):
    """Decode the UTF-8 bytes `s` and tokenize the text with the module-level tokenizer.

    The text is tokenized to TEXT_SEQ_LEN, truncating according to
    `--truncate_captions`, and `squeeze(0)` is applied to the result.
    """
    caption = s.decode('utf-8')
    tokens = tokenizer.tokenize(caption, TEXT_SEQ_LEN, truncate_text=args.truncate_captions)
    return tokens.squeeze(0)
if ENABLE_WEBDATASET:
    DATASET_SIZE = int(1e9)  # You need to set a nominal length for the Dataset in order to avoid warnings from DataLoader

    myimg, mycap = WEBDATASET_IMAGE_TEXT_COLUMNS
    # First mapping pass: decode image bytes and tokenize the caption.
    image_text_mapping = {
        myimg: imagetransform,
        mycap: tokenize,
    }
    # Second mapping pass: run the image preprocessing pipeline.
    image_mapping = {
        myimg: imagepreproc,
    }

    def filter_dataset(item):
        """Keep only samples that contain both the caption and the image column."""
        return mycap in item and myimg in item

    w_dataset = wds.WebDataset(DATASET, handler=wds.warn_and_continue)
    filtered_dataset = w_dataset.select(filter_dataset)
    # NOTE(review): `/` produces a float per-worker batch size; kept as in the
    # original -- confirm whether integer division was intended.
    ds = (
        filtered_dataset
        .map_dict(**image_text_mapping)
        .map_dict(**image_mapping)
        .to_tuple(mycap, myimg)
        .batched(BATCH_SIZE / distr_backend.get_world_size(), partial=True)
    )
else:
    # Folder-based image/text dataset.
    ds = TextImageDataset(
        args.image_text_folder,
        text_len=TEXT_SEQ_LEN,
        image_size=IMAGE_SIZE,
        transparent=TRANSPARENT,
        resize_ratio=args.resize_ratio,
        truncate_captions=args.truncate_captions,
        tokenizer=tokenizer,
        shuffle=is_shuffle,
    )
    assert len(ds) > 0, 'dataset is empty'

# Only the root worker reports the dataset size, and only for the folder dataset.
if is_root and not ENABLE_WEBDATASET:
    print(f'{len(ds)} image-text pairs found for training')
# data sampler

# A DistributedSampler is only created when shuffling is disabled.
data_sampler = None

if not is_shuffle:
    data_sampler = torch.utils.data.distributed.DistributedSampler(
        ds,
        num_replicas=distr_backend.get_world_size(),
        rank=distr_backend.get_rank(),
    )

# WebLoader for WebDataset and DeepSpeed compatibility
if ENABLE_WEBDATASET:
    dl = wds.WebLoader(ds, batch_size=None, shuffle=False, num_workers=4)  # optionally add num_workers=2 (n) argument
    # Cap the loader at the nominal number of batches derived from DATASET_SIZE.
    number_of_batches = DATASET_SIZE // (BATCH_SIZE * distr_backend.get_world_size())
    dl = dl.slice(number_of_batches)
    dl.length = number_of_batches
else:
    # Regular DataLoader for image-text-folder datasets
    dl = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=is_shuffle, drop_last=True, sampler=data_sampler)
# initialize DALL-E
dalle = DALLE(vae=vae, **dalle_params)

# Without DeepSpeed the model is cast (optionally) and moved to the GPU here.
if not using_deepspeed:
    if args.fp16:
        dalle = dalle.half()
    dalle = dalle.cuda()

# When resuming without DeepSpeed, restore the weights directly.
if RESUME and not using_deepspeed:
    dalle.load_state_dict(weights)

# optimizer
opt = Adam(get_trainable_params(dalle), lr=LEARNING_RATE)

if RESUME and opt_state:
    opt.load_state_dict(opt_state)

# scheduler
scheduler = None

if LR_DECAY:
    # Halve the learning rate when the monitored value stops decreasing.
    scheduler = ReduceLROnPlateau(
        opt,
        mode="min",
        factor=0.5,
        patience=10,
        cooldown=10,
        min_lr=1e-6,
        verbose=True,
    )
    if RESUME and scheduler_state:
        scheduler.load_state_dict(scheduler_state)
# experiment tracker

# Only the root worker starts a wandb run.
if is_root:
    model_config = dict(
        depth=DEPTH,
        heads=HEADS,
        dim_head=DIM_HEAD,
    )

    run = wandb.init(
        project=args.wandb_name,
        entity=args.wandb_entity,
        resume=False,
        config=model_config,
    )
# distribute

# Let the backend validate the batch size.
distr_backend.check_batch_size(BATCH_SIZE)

# DeepSpeed configuration
deepspeed_config = {
    'train_batch_size': BATCH_SIZE,
    'gradient_accumulation_steps': args.ga_steps,
    'gradient_clipping': GRAD_CLIP_NORM,
    'fp16': {
        'enabled': args.fp16,
    },
    'amp': {
        'enabled': args.amp,
        'opt_level': 'O1',
    },
    "flops_profiler": {
        "enabled": args.flops_profiler,
        "profile_step": 200,
        "module_depth": -1,
        "top_modules": 1,
        "detailed": True,
        "output_file": None  # TODO Can't get this to work.
    },
}

# Warn when the configured ZeRO optimization stage is 2 or higher.
if deepspeed_config.get('zero_optimization', {}).get('stage', 0) >= 2:
    print("Checkpoints made with DeepSpeed ZeRO Stages 2 and 3 will be stored in deepspeed checkpoint folder")
    print("As such, they will require DeepSpeed as a dependency in order to resume from or generate with.")
    print("See the deespeed conversion script for details on how to convert your ZeRO stage 2/3 checkpoint to a single file.")
    print("If using a single GPU, consider running with apex automatic mixed precision instead for a similar speedup to ZeRO.")
    time.sleep(2)

# Distribute the model, optimizer, data and scheduler through the backend.
(distr_dalle, distr_opt, distr_dl, distr_scheduler) = distr_backend.distribute(
    args=args,
    model=dalle,
    optimizer=opt,
    model_parameters=get_trainable_params(dalle),
    # DeepSpeed receives the dataset (or None for WebDataset); other
    # backends receive the already-built data loader.
    training_data=(
        (None if ENABLE_WEBDATASET else ds)
        if using_deepspeed
        else dl
    ),
    # Do not pass the LR scheduler to DeepSpeed so we can manually advance it.
    lr_scheduler=scheduler if LR_DECAY and not using_deepspeed else None,
    config_params=deepspeed_config,
)

# Prefer the scheduler from `deepspeed_config`; fall back to the global
# scheduler when LR decay is enabled and the backend returned none.
if LR_DECAY and distr_scheduler is None:
    distr_scheduler = scheduler

# With DeepSpeed and fp16 enabled, direct calls on the raw model are skipped.
avoid_model_calls = using_deepspeed and args.fp16

# When resuming with DeepSpeed, load the checkpoint through the engine.
if RESUME and using_deepspeed:
    distr_dalle.load_checkpoint(str(cp_dir))
# Checkpoint saving
def save_model(path, epoch=0):
    """Save a training checkpoint to ``path``.

    With DeepSpeed, the engine checkpoint is written to the directory returned
    by ``cp_path_to_dir(path, 'ds')``. The root worker additionally writes an
    auxiliary file there, and (when ``KEEP_N_CHECKPOINTS`` is set) prunes older
    ``global*`` checkpoints first. Unless the configured ZeRO stage is 2 or
    higher, the root worker then also writes the standard single-file
    checkpoint to ``path``. Without DeepSpeed only the standard checkpoint is
    written, by the root worker.

    Args:
        path: destination of the standard checkpoint file.
        epoch: epoch number stored in the checkpoint.
    """
    save_obj = {
        'hparams': dalle_params,
        'vae_params': vae_params,
        'epoch': epoch,
        'version': __version__,
        'vae_class_name': vae.__class__.__name__
    }

    if using_deepspeed:
        cp_dir = cp_path_to_dir(path, 'ds')

        # Keep only the newest KEEP_N_CHECKPOINTS entries, by modification time.
        if KEEP_N_CHECKPOINTS is not None and is_root:
            checkpoints = sorted(glob(str(cp_dir / "global*")), key=os.path.getmtime, reverse=True)
            for checkpoint in checkpoints[KEEP_N_CHECKPOINTS:]:
                shutil.rmtree(checkpoint)

        # Every worker calls into the engine; only the root continues.
        distr_dalle.save_checkpoint(cp_dir, client_state=save_obj)

        if not is_root:
            return

        # Save auxiliary values so the standard loading routine can be reused.
        save_obj = {
            **save_obj,
            # Save a nonsense value that directs the user to further help.
            'weights': (
                'To get a working standard checkpoint, '
                'look into consolidating DeepSpeed checkpoints.'
            ),
        }
        torch.save(save_obj, str(cp_dir / DEEPSPEED_CP_AUX_FILENAME))
        if deepspeed_config.get('zero_optimization', {}).get('stage', 0) >= 2:  # see https://github.com/lucidrains/DALLE-pytorch/wiki/DeepSpeed-Checkpoints
            return

    if not is_root:
        return

    save_obj = {
        **save_obj,
        'weights': dalle.state_dict(),
        'opt_state': opt.state_dict(),
        'scheduler_state': (scheduler.state_dict() if scheduler else None)
    }

    torch.save(save_obj, path)
# Log the model file and its config as a wandb artifact
def save_artifact(model_config, model_path, name = 'trained-dalle'):
    """Log the file at ``model_path`` as a wandb artifact of type ``'model'``.

    Args:
        model_config: mapping stored as the artifact metadata.
        model_path: path of the file added to the artifact.
        name: artifact name.
    """
    model_artifact = wandb.Artifact(name, type='model', metadata=dict(model_config))
    model_artifact.add_file(model_path)
    run.log_artifact(model_artifact)
# training

# Saves a checkpoint before training begins to fail early when mis-configured.
# See https://github.com/lucidrains/DALLE-pytorch/wiki/DeepSpeed-Checkpoints
save_model(DALLE_OUTPUT_FILE_NAME, epoch=resume_epoch)

for epoch in range(resume_epoch, EPOCHS):
    # Tell the sampler, when there is one, which epoch is starting.
    if data_sampler:
        data_sampler.set_epoch(epoch)

    for i, (text, images) in enumerate((dl if ENABLE_WEBDATASET else distr_dl)):
        # Start the throughput timer at the beginning of each 10-step window.
        if i % 10 == 0 and is_root:
            t = time.time()

        if args.fp16:
            images = images.half()

        # Move the batch to the GPU.
        text, images = map(lambda tensor: tensor.cuda(), (text, images))

        loss = distr_dalle(text, images, return_loss=True)

        if using_deepspeed:
            distr_dalle.backward(loss)
            distr_dalle.step()
            # Gradients are automatically zeroed after the step
        else:
            loss.backward()
            clip_grad_norm_(distr_dalle.parameters(), GRAD_CLIP_NORM)
            distr_opt.step()
            distr_opt.zero_grad()

        # Collective loss, averaged
        avg_loss = distr_backend.average_all(loss)

        log = {}

        # Report the loss every 10 steps.
        if i % 10 == 0 and is_root:
            print(epoch, i, f'loss - {avg_loss.item()}')

            log = {
                **log,
                'epoch': epoch,
                'iter': i,
                'loss': avg_loss.item()
            }

        # Periodic checkpoint.
        if i % SAVE_EVERY_N_STEPS == 0:
            save_model(DALLE_OUTPUT_FILE_NAME, epoch=epoch)

        # Every 100 steps, decode the first caption and log a sample image.
        if i % 100 == 0 and is_root:
            sample_text = text[:1]
            token_list = sample_text.masked_select(sample_text != 0).tolist()
            decoded_text = tokenizer.decode(token_list)

            if not avoid_model_calls:
                # CUDA index errors when we don't guard this
                image = dalle.generate_images(text[:1], filter_thres=0.9)  # topk sampling at 0.9
                log['image'] = wandb.Image(image, caption=decoded_text)

        # At the end of each 10-step window, report throughput.
        if i % 10 == 9 and is_root:
            sample_per_sec = BATCH_SIZE * 10 / (time.time() - t)
            log["sample_per_sec"] = sample_per_sec
            print(epoch, i, f'sample_per_sec - {sample_per_sec}')

        # Stop once step 201 is reached when the FLOPS profiler is enabled.
        if i == 201 and args.flops_profiler:
            raise StopIteration("Profiler has finished running. Stopping training early.")

        if is_root:
            wandb.log(log)

    # Advance the LR scheduler with the last averaged loss of the epoch.
    if LR_DECAY:
        distr_scheduler.step(avg_loss)

    save_model(DALLE_OUTPUT_FILE_NAME, epoch=epoch)

    if is_root:
        # save trained model to wandb as an artifact every epoch's end
        save_artifact(model_config, DALLE_OUTPUT_FILE_NAME)

# Final checkpoint after the last epoch.
save_model(DALLE_OUTPUT_FILE_NAME, epoch=epoch)

if is_root:
    # Upload the trained model to wandb and close the run.
    wandb.save(DALLE_OUTPUT_FILE_NAME)
    save_artifact(model_config, DALLE_OUTPUT_FILE_NAME)
    wandb.finish()

#以上关于Lucidrains 系列项目源码解析（十七）的相关内容来源网络仅供参考，相关信息请以官方公告为准！

原创文章，作者：CSDN，如若转载，请注明出处：https://www.sudun.com/ask/93266.html

Lucidrains 系列项目源码解析（十七）

.\\lucidrains\\DALLE-pytorch\\dalle_pytorch\\reversible.py

.\\lucidrains\\DALLE-pytorch\\dalle_pytorch\\transformer.py

.\\lucidrains\\DALLE-pytorch\\dalle_pytorch\\vae.py

.\\lucidrains\\DALLE-pytorch\\dalle_pytorch\\version.py

.\\lucidrains\\DALLE-pytorch\\dalle_pytorch\\__init__.py

.\\lucidrains\\DALLE-pytorch\\generate.py

DALL-E in Pytorch

Appreciation

Status

Install

Usage

OpenAI’s Pretrained VAE

Taming Transformer’s Pretrained VQGAN VAE

Adjust text conditioning strength

Ranking the generations

Scaling depth

Sparse Attention

Deepspeed Sparse Attention

Training

VAE

DALL-E Training

Training using an Image-Text-Folder

Chinese

Citations

.\\lucidrains\\DALLE-pytorch\\setup.py

.\\lucidrains\\DALLE-pytorch\\train_dalle.py

相关推荐

ChatGPT 中的“GPT”是什么意思？三个词，测出你的英语水平！

网络地址转换nat原理（网络地址转换nat的特点）

如何找到最快解析速度的DNS，怎么解析dns地址

中国龙模型3D打印图纸 STL格式

发表回复

Please sign in

.\\lucidrains\\DALLE-pytorch\\dalle_pytorch\\__init__.py