import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
def new_parameter(*size):
out = nn.Parameter(torch.FloatTensor(*size))
torch.nn.init.xavier_normal_(out)
return out
class Attention(nn.Module):
    """Dot-product attention pooling over a sequence.

    Learns a single vector of size ``attention_size``; each timestep's
    score is its dot product with that vector.  The softmax-normalized
    scores weight the timesteps, which are then summed into one vector
    per batch element: (batch, seq, vec) -> (batch, vec).
    """

    def __init__(self, attention_size):
        super(Attention, self).__init__()
        # Trailing 1-dim lets matmul() act as a per-timestep dot product.
        self.attention = new_parameter(attention_size, 1)

    def forward(self, x_in):
        """Condense ``x_in`` of shape (batch, seq, vec) to (batch, vec)."""
        # (batch, seq, vec) @ (vec, 1) -> (batch, seq, 1); squeeze only the
        # trailing dim.  A bare squeeze() would also collapse a batch or
        # sequence dimension of size 1 and break the softmax below.
        attention_score = torch.matmul(x_in, self.attention).squeeze(-1)
        # Normalize over the sequence dimension, then restore the trailing
        # 1-dim so it broadcasts against x_in in the elementwise product.
        attention_score = F.softmax(attention_score, dim=1).view(
            x_in.size(0), x_in.size(1), 1
        )
        # Broadcasted elementwise product: each timestep scaled by its weight.
        scored_x = torch.mul(x_in, attention_score)
        # Sum across the sequence to get the expected feature vector.
        condensed_x = torch.sum(scored_x, dim=1)
        return condensed_x
def test_attention():
    """Smoke-test Attention: a (16, 30, 100) batch condenses to (16, 100)."""
    attn = Attention(100)
    # Variable is deprecated since PyTorch 0.4; plain tensors track autograd.
    x = torch.randn(16, 30, 100)
    out = attn(x)
    # The original comparison discarded its result, so the check could never
    # fail; assert makes a shape regression actually raise.
    assert out.size() == (16, 100)
    print(out)
if __name__ == "__main__":
    # Run the smoke test only when executed as a script, not on import.
    test_attention()