Flatten Attention
Introduction: Flatten Attention is a recent attention mechanism that builds Vision Transformers on a focused linear attention. When Transformer models are applied to vision tasks, the computational complexity of self-attention grows quadratically with the sequence length, which is a challenge for vision applications. Linear attention mechanisms, whose complexity grows only linearly with the sequence length, offer a more efficient alternative. Linear attention replaces the Softmax operation in self-attention with a carefully designed mapping function, but this line of work either suffers a substantial drop in accuracy or introduces extra computational overhead through the mapping function itself.
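To make the complexity argument concrete, here is a minimal, self-contained sketch (not taken from the paper; the feature map `phi` and the helper names are illustrative) of how a linear attention avoids the N×N score matrix: by applying a kernel function to Q and K, the product phi(K)ᵀV can be computed first, so the cost grows linearly rather than quadratically with the number of tokens N.

import torch

def softmax_attention(q, k, v):
    # Standard self-attention: materializes an (N, N) score matrix -> O(N^2) in the token count N.
    attn = torch.softmax(q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5, dim=-1)
    return attn @ v

def linear_attention(q, k, v, phi=torch.relu, eps=1e-6):
    # Kernelized attention: compute phi(K)^T V first -> O(N) in the token count N.
    q, k = phi(q) + eps, phi(k) + eps
    kv = k.transpose(-2, -1) @ v                              # (d, d) summary, independent of N
    z = q @ k.sum(dim=-2, keepdim=True).transpose(-2, -1)     # per-token normalizer, shape (N, 1)
    return (q @ kv) / z

q = k = v = torch.randn(1, 196, 64)   # 196 tokens of dimension 64
print(softmax_attention(q, k, v).shape, linear_attention(q, k, v).shape)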
The authors propose Focused Linear Attention, which aims to achieve both high efficiency and high expressiveness. They first analyze what causes the performance drop of linear attention and attribute it to two factors: focus ability and feature diversity. They then introduce a simple yet effective mapping function together with an efficient rank restoration module, which strengthen the expressiveness of self-attention while keeping the computational complexity low.
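The focused mapping can be read directly off the FocusedLinearAttention.forward method in the code below: ReLU features are raised to a power p (the focusing factor) and then rescaled so that their norm is unchanged, which sharpens the attention distribution, and a depthwise convolution over V is added afterwards as the rank restoration module. A standalone sketch of just the mapping (the wrapper name focused_map is mine, not the paper's):

import torch

def focused_map(x, focusing_factor=3, eps=1e-6):
    # Focused feature map: keep the norm of ReLU(x), sharpen its direction via x ** p.
    x = torch.relu(x) + eps
    x_norm = x.norm(dim=-1, keepdim=True)              # remember the original norm
    x = x ** focusing_factor                            # emphasize the dominant channels
    return x / x.norm(dim=-1, keepdim=True) * x_norm    # rescale back to the original norm

tokens = torch.randn(1, 49, 32)
print(focused_map(tokens).shape)  # torch.Size([1, 49, 32])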
Original paper: FLatten Transformer: Vision Transformer using Focused Linear Attention. Code implementation:
import torch
import torch.nn as nn
import torch.nn.functional as F
from functools import partial
from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from timm.models.helpers import load_pretrained
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
from timm.models.registry import register_model
from einops.layers.torch import Rearrange
import torch.utils.checkpoint as checkpoint
import numpy as np
import time
from einops import rearrange


def _cfg(url='', **kwargs):
    return {
        'url': url,
        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
        'crop_pct': .9, 'interpolation': 'bicubic',
        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
        'first_conv': 'patch_embed.proj', 'classifier': 'head',
        **kwargs
    }


default_cfgs = {
    'cswin_224': _cfg(),
    'cswin_384': _cfg(crop_pct=1.0),
}


class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class LePEAttention(nn.Module):
    def __init__(self, dim, resolution, idx, split_size=7, dim_out=None, num_heads=8, attn_drop=0., proj_drop=0.,
                 qk_scale=None):
        super().__init__()
        self.dim = dim
        self.dim_out = dim_out or dim
        self.resolution = resolution
        self.split_size = split_size
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        self.scale = qk_scale or head_dim ** -0.5
        if idx == -1:
            H_sp, W_sp = self.resolution, self.resolution
        elif idx == 0:
            H_sp, W_sp = self.resolution, self.split_size
        elif idx == 1:
            W_sp, H_sp = self.resolution, self.split_size
        else:
            print("ERROR MODE", idx)
            exit(0)
        self.H_sp = H_sp
        self.W_sp = W_sp
        stride = 1
        self.get_v = nn.Conv2d(dim, dim, kernel_size=3, stride=1, padding=1, groups=dim)

        self.attn_drop = nn.Dropout(attn_drop)

    def im2cswin(self, x):
        B, N, C = x.shape
        H = W = int(np.sqrt(N))
        x = x.transpose(-2, -1).contiguous().view(B, C, H, W)
        x = img2windows(x, self.H_sp, self.W_sp)
        x = x.reshape(-1, self.H_sp * self.W_sp, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3).contiguous()
        return x

    def get_lepe(self, x, func):
        B, N, C = x.shape
        H = W = int(np.sqrt(N))
        x = x.transpose(-2, -1).contiguous().view(B, C, H, W)

        H_sp, W_sp = self.H_sp, self.W_sp
        x = x.view(B, C, H // H_sp, H_sp, W // W_sp, W_sp)
        x = x.permute(0, 2, 4, 1, 3, 5).contiguous().reshape(-1, C, H_sp, W_sp)  ### B', C, H', W'

        lepe = func(x)  ### B', C, H', W'
        lepe = lepe.reshape(-1, self.num_heads, C // self.num_heads, H_sp * W_sp).permute(0, 1, 3, 2).contiguous()

        x = x.reshape(-1, self.num_heads, C // self.num_heads, self.H_sp * self.W_sp).permute(0, 1, 3, 2).contiguous()
        return x, lepe

    def forward(self, qkv):
        """
        x: B L C
        """
        q, k, v = qkv[0], qkv[1], qkv[2]

        ### Img2Window
        H = W = self.resolution
        B, L, C = q.shape
        assert L == H * W, "flatten img_tokens has wrong size"

        q = self.im2cswin(q)
        k = self.im2cswin(k)
        v, lepe = self.get_lepe(v, self.get_v)

        q = q * self.scale
        attn = (q @ k.transpose(-2, -1))  # B head N C @ B head C N --> B head N N
        attn = nn.functional.softmax(attn, dim=-1, dtype=attn.dtype)
        attn = self.attn_drop(attn)

        x = (attn @ v) + lepe
        x = x.transpose(1, 2).reshape(-1, self.H_sp * self.W_sp, C)  # B head N N @ B head N C

        ### Window2Img
        x = windows2img(x, self.H_sp, self.W_sp, H, W).view(B, -1, C)  # B H' W' C

        return x


class FocusedLinearAttention(nn.Module):
    def __init__(self, dim, resolution, idx, split_size=7, dim_out=None, num_heads=8, attn_drop=0., proj_drop=0.,
                 qk_scale=None, focusing_factor=3, kernel_size=5):
        super().__init__()
        self.dim = dim
        self.dim_out = dim_out or dim
        self.resolution = resolution
        self.split_size = split_size
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
        # self.scale = qk_scale or head_dim ** -0.5
        if idx == -1:
            H_sp, W_sp = self.resolution, self.resolution
        elif idx == 0:
            H_sp, W_sp = self.resolution, self.split_size
        elif idx == 1:
            W_sp, H_sp = self.resolution, self.split_size
        else:
            print("ERROR MODE", idx)
            exit(0)
        self.H_sp = H_sp
        self.W_sp = W_sp
        stride = 1
        self.get_v = nn.Conv2d(dim, dim, kernel_size=3, stride=1, padding=1, groups=dim)

        self.attn_drop = nn.Dropout(attn_drop)

        self.focusing_factor = focusing_factor
        # Depthwise conv over V: the rank restoration module that recovers feature diversity.
        self.dwc = nn.Conv2d(in_channels=head_dim, out_channels=head_dim, kernel_size=kernel_size,
                             groups=head_dim, padding=kernel_size // 2)
        self.scale = nn.Parameter(torch.zeros(size=(1, 1, dim)))
        self.positional_encoding = nn.Parameter(torch.zeros(size=(1, self.H_sp * self.W_sp, dim)))
        print('Linear Attention {}x{} f{} kernel{}'.format(H_sp, W_sp, focusing_factor, kernel_size))

    def im2cswin(self, x):
        B, N, C = x.shape
        H = W = int(np.sqrt(N))
        x = x.transpose(-2, -1).contiguous().view(B, C, H, W)
        x = img2windows(x, self.H_sp, self.W_sp)
        # x = x.reshape(-1, self.H_sp * self.W_sp, C).contiguous()
        return x

    def get_lepe(self, x, func):
        B, N, C = x.shape
        H = W = int(np.sqrt(N))
        x = x.transpose(-2, -1).contiguous().view(B, C, H, W)

        H_sp, W_sp = self.H_sp, self.W_sp
        x = x.view(B, C, H // H_sp, H_sp, W // W_sp, W_sp)
        x = x.permute(0, 2, 4, 1, 3, 5).contiguous().reshape(-1, C, H_sp, W_sp)  ### B', C, H', W'

        lepe = func(x)  ### B', C, H', W'
        lepe = lepe.reshape(-1, C // self.num_heads, H_sp * W_sp).permute(0, 2, 1).contiguous()

        x = x.reshape(-1, C, self.H_sp * self.W_sp).permute(0, 2, 1).contiguous()
        return x, lepe

    def forward(self, qkv):
        """
        x: B L C
        """
        q, k, v = qkv[0], qkv[1], qkv[2]

        ### Img2Window
        H = W = self.resolution
        B, L, C = q.shape
        assert L == H * W, "flatten img_tokens has wrong size"

        q = self.im2cswin(q)
        k = self.im2cswin(k)
        v, lepe = self.get_lepe(v, self.get_v)

        # q, k, v = (rearrange(x, "b h n c -> b n (h c)", h=self.num_heads) for x in [q, k, v])
        k = k + self.positional_encoding
        # Focused mapping: ReLU features raised to the focusing factor, rescaled to keep their norm.
        focusing_factor = self.focusing_factor
        kernel_function = nn.ReLU()
        scale = nn.Softplus()(self.scale)
        q = kernel_function(q) + 1e-6
        k = kernel_function(k) + 1e-6
        q = q / scale
        k = k / scale
        q_norm = q.norm(dim=-1, keepdim=True)
        k_norm = k.norm(dim=-1, keepdim=True)
        q = q ** focusing_factor
        k = k ** focusing_factor
        q = (q / q.norm(dim=-1, keepdim=True)) * q_norm
        k = (k / k.norm(dim=-1, keepdim=True)) * k_norm
        q, k, v = (rearrange(x, "b n (h c) -> (b h) n c", h=self.num_heads) for x in [q, k, v])
        i, j, c, d = q.shape[-2], k.shape[-2], k.shape[-1], v.shape[-1]

        # Choose the cheaper einsum order: (Q K^T) V or Q (K^T V).
        z = 1 / (torch.einsum("b i c, b c -> b i", q, k.sum(dim=1)) + 1e-6)
        if i * j * (c + d) > c * d * (i + j):
            kv = torch.einsum("b j c, b j d -> b c d", k, v)
            x = torch.einsum("b i c, b c d, b i -> b i d", q, kv, z)
        else:
            qk = torch.einsum("b i c, b j c -> b i j", q, k)
            x = torch.einsum("b i j, b j d, b i -> b i d", qk, v, z)

        feature_map = rearrange(v, "b (h w) c -> b c h w", h=self.H_sp, w=self.W_sp)
        feature_map = rearrange(self.dwc(feature_map), "b c h w -> b (h w) c")
        x = x + feature_map

        x = x + lepe
        x = rearrange(x, "(b h) n c -> b n (h c)", h=self.num_heads)

        ### Window2Img
        x = windows2img(x, self.H_sp, self.W_sp, H, W).view(B, -1, C)  # B H' W' C

        return x


class CSWinBlock(nn.Module):

    def __init__(self, dim, reso, num_heads,
                 split_size=7, mlp_ratio=4., qkv_bias=False, qk_scale=None,
                 drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm,
                 last_stage=False,
                 focusing_factor=3, kernel_size=5, attn_type='L'):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.patches_resolution = reso
        self.split_size = split_size
        self.mlp_ratio = mlp_ratio
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.norm1 = norm_layer(dim)

        if self.patches_resolution == split_size:
            last_stage = True
        if last_stage:
            self.branch_num = 1
        else:
            self.branch_num = 2
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(drop)

        assert attn_type in ['L', 'S']
        if attn_type == 'L':
            if last_stage:
                self.attns = nn.ModuleList([
                    FocusedLinearAttention(
                        dim, resolution=self.patches_resolution, idx=-1,
                        split_size=split_size, num_heads=num_heads, dim_out=dim,
                        qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop,
                        focusing_factor=focusing_factor, kernel_size=kernel_size)
                    for i in range(self.branch_num)])
            else:
                self.attns = nn.ModuleList([
                    FocusedLinearAttention(
                        dim // 2, resolution=self.patches_resolution, idx=i,
                        split_size=split_size, num_heads=num_heads // 2, dim_out=dim // 2,
                        qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop,
                        focusing_factor=focusing_factor, kernel_size=kernel_size)
                    for i in range(self.branch_num)])
        else:
            if last_stage:
                self.attns = nn.ModuleList([
                    LePEAttention(
                        dim, resolution=self.patches_resolution, idx=-1,
                        split_size=split_size, num_heads=num_heads, dim_out=dim,
                        qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
                    for i in range(self.branch_num)])
            else:
                self.attns = nn.ModuleList([
                    LePEAttention(
                        dim // 2, resolution=self.patches_resolution, idx=i,
                        split_size=split_size, num_heads=num_heads // 2, dim_out=dim // 2,
                        qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
                    for i in range(self.branch_num)])

        mlp_hidden_dim = int(dim * mlp_ratio)

        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, out_features=dim, act_layer=act_layer,
                       drop=drop)
        self.norm2 = norm_layer(dim)

    def forward(self, x):
        """
        x: B, H*W, C
        """
        H = W = self.patches_resolution
        B, L, C = x.shape
        assert L == H * W, "flatten img_tokens has wrong size"
        img = self.norm1(x)
        qkv = self.qkv(img).reshape(B, -1, 3, C).permute(2, 0, 1, 3)

        if self.branch_num == 2:
            x1 = self.attns[0](qkv[:, :, :, :C // 2])
            x2 = self.attns[1](qkv[:, :, :, C // 2:])
            attened_x = torch.cat([x1, x2], dim=2)
        else:
            attened_x = self.attns[0](qkv)
        attened_x = self.proj(attened_x)
        x = x + self.drop_path(attened_x)
        x = x + self.drop_path(self.mlp(self.norm2(x)))

        return x


def img2windows(img, H_sp, W_sp):
    """
    img: B C H W
    """
    B, C, H, W = img.shape
    img_reshape = img.view(B, C, H // H_sp, H_sp, W // W_sp, W_sp)
    img_perm = img_reshape.permute(0, 2, 4, 3, 5, 1).contiguous().reshape(-1, H_sp * W_sp, C)
    return img_perm


def windows2img(img_splits_hw, H_sp, W_sp, H, W):
    """
    img_splits_hw: B' H W C
    """
    B = int(img_splits_hw.shape[0] / (H * W / H_sp / W_sp))

    img = img_splits_hw.view(B, H // H_sp, W // W_sp, H_sp, W_sp, -1)
    img = img.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return img


class Merge_Block(nn.Module):
    def __init__(self, dim, dim_out, norm_layer=nn.LayerNorm):
        super().__init__()
        self.conv = nn.Conv2d(dim, dim_out, 3, 2, 1)
        self.norm = norm_layer(dim_out)

    def forward(self, x):
        B, new_HW, C = x.shape
        H = W = int(np.sqrt(new_HW))
        x = x.transpose(-2, -1).contiguous().view(B, C, H, W)
        x = self.conv(x)
        B, C = x.shape[:2]
        x = x.view(B, C, -1).transpose(-2, -1).contiguous()
        x = self.norm(x)

        return x


class CSWinTransformer(nn.Module):
    """ Vision Transformer with support for patch or hybrid CNN input stage
    """

    def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=96, depth=[2, 2, 6, 2],
                 split_size=[1, 2, 7, 7], la_split_size='1-2-7-7',
                 num_heads=[2, 4, 8, 16], mlp_ratio=4., qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
                 drop_path_rate=0., hybrid_backbone=None, norm_layer=nn.LayerNorm, use_chk=False,
                 focusing_factor=3, kernel_size=5, attn_type='LLLL'):
        super().__init__()
        # split_size = [1, 2, img_size // 32, img_size // 32]
        la_split_size = la_split_size.split('-')
        self.use_chk = use_chk
        self.num_classes = num_classes
        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
        heads = num_heads

        self.stage1_conv_embed = nn.Sequential(
            nn.Conv2d(in_chans, embed_dim, 7, 4, 2),
            Rearrange('b c h w -> b (h w) c', h=img_size // 4, w=img_size // 4),
            nn.LayerNorm(embed_dim)
        )

        curr_dim = embed_dim
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, np.sum(depth))]  # stochastic depth decay rule
        attn_types = [(attn_type[0] if attn_type[0] != 'M' else ('L' if i < int(attn_type[4:]) else 'S'))
                      for i in range(depth[0])]
        split_sizes = [(int(la_split_size[0]) if attn_types[i] == 'L' else split_size[0]) for i in range(depth[0])]
        self.stage1 = nn.ModuleList([
            CSWinBlock(
                dim=curr_dim, num_heads=heads[0], reso=img_size // 4, mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias, qk_scale=qk_scale,
                split_size=split_sizes[i],
                drop=drop_rate, attn_drop=attn_drop_rate,
                drop_path=dpr[i], norm_layer=norm_layer,
                focusing_factor=focusing_factor, kernel_size=kernel_size,
                attn_type=attn_types[i])
            for i in range(depth[0])])

        self.merge1 = Merge_Block(curr_dim, curr_dim * 2)
        curr_dim = curr_dim * 2
        attn_types = [(attn_type[1] if attn_type[1] != 'M' else ('L' if i < int(attn_type[4:]) else 'S'))
                      for i in range(depth[1])]
        split_sizes = [(int(la_split_size[1]) if attn_types[i] == 'L' else split_size[1]) for i in range(depth[1])]
        self.stage2 = nn.ModuleList([
            CSWinBlock(
                dim=curr_dim, num_heads=heads[1], reso=img_size // 8, mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias, qk_scale=qk_scale,
                split_size=split_sizes[i],
                drop=drop_rate, attn_drop=attn_drop_rate,
                drop_path=dpr[np.sum(depth[:1]) + i], norm_layer=norm_layer,
                focusing_factor=focusing_factor, kernel_size=kernel_size,
                attn_type=attn_types[i])
            for i in range(depth[1])])

        self.merge2 = Merge_Block(curr_dim, curr_dim * 2)
        curr_dim = curr_dim * 2
        attn_types = [(attn_type[2] if attn_type[2] != 'M' else ('L' if i < int(attn_type[4:]) else 'S'))
                      for i in range(depth[2])]
        split_sizes = [(int(la_split_size[2]) if attn_types[i] == 'L' else split_size[2]) for i in range(depth[2])]
        temp_stage3 = []
        temp_stage3.extend([
            CSWinBlock(
                dim=curr_dim, num_heads=heads[2], reso=img_size // 16, mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias, qk_scale=qk_scale,
                split_size=split_sizes[i],
                drop=drop_rate, attn_drop=attn_drop_rate,
                drop_path=dpr[np.sum(depth[:2]) + i], norm_layer=norm_layer,
                focusing_factor=focusing_factor, kernel_size=kernel_size,
                attn_type=attn_types[i])
            for i in range(depth[2])])

        self.stage3 = nn.ModuleList(temp_stage3)

        self.merge3 = Merge_Block(curr_dim, curr_dim * 2)
        curr_dim = curr_dim * 2
        attn_types = [(attn_type[3] if attn_type[3] != 'M' else ('L' if i < int(attn_type[4:]) else 'S'))
                      for i in range(depth[3])]
        split_sizes = [(int(la_split_size[3]) if attn_types[i] == 'L' else split_size[3]) for i in range(depth[3])]
        self.stage4 = nn.ModuleList([
            CSWinBlock(
                dim=curr_dim, num_heads=heads[3], reso=img_size // 32, mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias, qk_scale=qk_scale,
                split_size=split_sizes[i],
                drop=drop_rate, attn_drop=attn_drop_rate,
                drop_path=dpr[np.sum(depth[:-1]) + i], norm_layer=norm_layer, last_stage=True,
                focusing_factor=focusing_factor, kernel_size=kernel_size,
                attn_type=attn_types[i])
            for i in range(depth[-1])])

        self.norm = norm_layer(curr_dim)
        # Classifier head
        self.head = nn.Linear(curr_dim, num_classes) if num_classes > 0 else nn.Identity()

        trunc_normal_(self.head.weight, std=0.02)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, (nn.LayerNorm, nn.BatchNorm2d)):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)

    @torch.jit.ignore
    def no_weight_decay(self):
        return {'pos_embed', 'cls_token'}

    def get_classifier(self):
        return self.head

    def reset_classifier(self, num_classes, global_pool=''):
        if self.num_classes != num_classes:
            print('reset head to', num_classes)
            self.num_classes = num_classes
            self.head = nn.Linear(self.out_dim, num_classes) if num_classes > 0 else nn.Identity()
            self.head = self.head.cuda()
            trunc_normal_(self.head.weight, std=.02)
            if self.head.bias is not None:
                nn.init.constant_(self.head.bias, 0)

    def forward_features(self, x):
        B = x.shape[0]
        x = self.stage1_conv_embed(x)
        for blk in self.stage1:
            if self.use_chk:
                x = checkpoint.checkpoint(blk, x)
            else:
                x = blk(x)
        for pre, blocks in zip([self.merge1, self.merge2, self.merge3],
                               [self.stage2, self.stage3, self.stage4]):
            x = pre(x)
            for blk in blocks:
                if self.use_chk:
                    x = checkpoint.checkpoint(blk, x)
                else:
                    x = blk(x)
        x = self.norm(x)
        return torch.mean(x, dim=1)

    def forward(self, x):
        x = self.forward_features(x)
        x = self.head(x)
        return x


def _conv_filter(state_dict, patch_size=16):
    """ convert patch embedding weight from manual patchify + linear proj to conv"""
    out_dict = {}
    for k, v in state_dict.items():
        if 'patch_embed.proj.weight' in k:
            v = v.reshape((v.shape[0], 3, patch_size, patch_size))
        out_dict[k] = v
    return out_dict


### 224 models


def FLatten_CSWin_64_24181_tiny_224(pretrained=False, **kwargs):
    model = CSWinTransformer(patch_size=4, embed_dim=64, depth=[2, 4, 18, 1],
                             split_size=[1, 2, 7, 7], num_heads=[2, 4, 8, 16], mlp_ratio=4., **kwargs)
    model.default_cfg = default_cfgs['cswin_224']
    return model


def FLatten_CSWin_64_24322_small_224(pretrained=False, **kwargs):
    model = CSWinTransformer(patch_size=4, embed_dim=64, depth=[2, 4, 32, 2],
                             split_size=[1, 2, 7, 7], num_heads=[2, 4, 8, 16], mlp_ratio=4., **kwargs)
    model.default_cfg = default_cfgs['cswin_224']
    return model


def FLatten_CSWin_96_36292_base_224(pretrained=False, **kwargs):
    model = CSWinTransformer(patch_size=4, embed_dim=96, depth=[3, 6, 29, 2],
                             split_size=[1, 2, 7, 7], num_heads=[4, 8, 16, 32], mlp_ratio=4., **kwargs)
    model.default_cfg = default_cfgs['cswin_224']
    return model


### 384 models


def FLatten_CSWin_96_36292_base_384(pretrained=False, **kwargs):
    model = CSWinTransformer(patch_size=4, embed_dim=96, depth=[3, 6, 29, 2],
                             split_size=[1, 2, 12, 12], num_heads=[4, 8, 16, 32], mlp_ratio=4., **kwargs)
    model.default_cfg = default_cfgs['cswin_384']
    return model
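A quick smoke test, assuming the listing above is saved as flatten_cswin.py (a file name chosen here for illustration) and that torch, timm, and einops are installed; the tiny 224-resolution variant is used purely as an example:

import torch
from flatten_cswin import FLatten_CSWin_64_24181_tiny_224

model = FLatten_CSWin_64_24181_tiny_224(img_size=224)
model.eval()
with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))   # dummy ImageNet-sized input
print(logits.shape)  # torch.Size([1, 1000])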