class MultiheadAttention(Module):
  # Serialized form of a multi-head attention module. The `__torch__.…`
  # qualified names used below suggest this is TorchScript-exported code.
  # NOTE(review): presumably machine-generated — confirm before hand-editing,
  # since changes may be overwritten on re-export.

  # Names of the attributes registered as parameters / buffers.
  __parameters__ = ["in_proj_weight", "q_proj_weight", "k_proj_weight", "v_proj_weight", "in_proj_bias", ]
  __buffers__ = []
  in_proj_weight : Tensor
  # The three per-projection weights are typed NoneType in this definition,
  # i.e. they hold no tensor here; `forward` still reads and forwards them in
  # the branch taken when `_qkv_same_embed_dim` is false.
  q_proj_weight : NoneType
  k_proj_weight : NoneType
  v_proj_weight : NoneType
  in_proj_bias : Tensor
  training : bool
  _is_full_backward_hook : NoneType
  embed_dim : int
  kdim : int
  vdim : int
  # Selects which call variant `forward` uses (see the branch below).
  _qkv_same_embed_dim : bool
  num_heads : int
  dropout : float
  head_dim : int
  bias_k : Optional[Tensor]
  bias_v : Optional[Tensor]
  add_zero_attn : bool
  # Supplies the `weight` and `bias` passed to the attention function.
  out_proj : __torch__.torch.nn.modules.linear.NonDynamicallyQuantizableLinear
  # Declared as a compile-time constant. `forward` does not read it; instead
  # it unconditionally swaps dims 0 and 1 of its inputs and of the attention
  # output. NOTE(review): presumably the exporter folded the batch_first=True
  # branch into the code — confirm.
  batch_first : Final[bool] = True

  # Run multi-head attention over `query`, `key` and `value`.
  #
  # Args:
  #   query, key, value: input tensors; dims 0 and 1 of each are swapped
  #     before the attention call.
  #   key_padding_mask: optional tensor forwarded unchanged.
  #   need_weights: flag forwarded unchanged.
  #   attn_mask: optional tensor forwarded unchanged.
  #
  # Returns:
  #   A 2-tuple: the attention output with dims 0 and 1 swapped back, and the
  #   second element returned by the attention function (may be None per the
  #   declared return type).
  def forward(self: __torch__.torch.nn.modules.activation.MultiheadAttention,
    query: Tensor,
    key: Tensor,
    value: Tensor,
    key_padding_mask: Optional[Tensor]=None,
    need_weights: bool=True,
    attn_mask: Optional[Tensor]=None) -> Tuple[Tensor, Optional[Tensor]]:
    # Alias for the functional implementation called in both branches.
    _0 = __torch__.torch.nn.functional.multi_head_attention_forward
    # Build a list of the three inputs with dims 0 and 1 swapped. The list
    # `_1` is filled by the append calls; `_2`, `_3`, `_4` only hold the
    # append results and are never read.
    _1 = annotate(List[Tensor], [])
    _2 = torch.append(_1, torch.transpose(query, 1, 0))
    _3 = torch.append(_1, torch.transpose(key, 1, 0))
    _4 = torch.append(_1, torch.transpose(value, 1, 0))
    # Unpack the transposed tensors in insertion order.
    query0, key0, value0, = _1
    _qkv_same_embed_dim = self._qkv_same_embed_dim
    if torch.__not__(_qkv_same_embed_dim):
      # Differing q/k/v embedding dims: pass True in the 18th positional slot
      # together with the three per-projection weights.
      # NOTE(review): that slot is presumably `use_separate_proj_weight`, and
      # the two trailing None values presumably `static_k` / `static_v` —
      # confirm against the multi_head_attention_forward signature.
      embed_dim = self.embed_dim
      num_heads = self.num_heads
      in_proj_weight = self.in_proj_weight
      in_proj_bias = self.in_proj_bias
      bias_k = self.bias_k
      bias_v = self.bias_v
      add_zero_attn = self.add_zero_attn
      dropout = self.dropout
      out_proj = self.out_proj
      weight = out_proj.weight
      out_proj0 = self.out_proj
      bias = out_proj0.bias
      training = self.training
      q_proj_weight = self.q_proj_weight
      k_proj_weight = self.k_proj_weight
      v_proj_weight = self.v_proj_weight
      _5 = _0(query0, key0, value0, embed_dim, num_heads, in_proj_weight, in_proj_bias, bias_k, bias_v, add_zero_attn, dropout, weight, bias, training, key_padding_mask, need_weights, attn_mask, True, q_proj_weight, k_proj_weight, v_proj_weight, None, None, )
      attn_output0, attn_output_weights0, = _5
      attn_output, attn_output_weights = attn_output0, attn_output_weights0
    else:
      # Same embedding dim for q/k/v: same call, but with False in that slot
      # and None in place of the three per-projection weights.
      embed_dim0 = self.embed_dim
      num_heads0 = self.num_heads
      in_proj_weight0 = self.in_proj_weight
      in_proj_bias0 = self.in_proj_bias
      bias_k0 = self.bias_k
      bias_v0 = self.bias_v
      add_zero_attn0 = self.add_zero_attn
      dropout0 = self.dropout
      out_proj1 = self.out_proj
      weight0 = out_proj1.weight
      out_proj2 = self.out_proj
      bias0 = out_proj2.bias
      training0 = self.training
      _6 = _0(query0, key0, value0, embed_dim0, num_heads0, in_proj_weight0, in_proj_bias0, bias_k0, bias_v0, add_zero_attn0, dropout0, weight0, bias0, training0, key_padding_mask, need_weights, attn_mask, False, None, None, None, None, None, )
      attn_output1, attn_output_weights1, = _6
      attn_output, attn_output_weights = attn_output1, attn_output_weights1
    # Swap dims 0 and 1 of the output back, mirroring the input transposes;
    # the weights element is returned as received.
    _7 = (torch.transpose(attn_output, 1, 0), attn_output_weights)
    return _7
class ReLU(Module):
  # Serialized form of a ReLU activation module; it registers no parameters
  # and no buffers.
  __parameters__ = []
  __buffers__ = []
  training : bool
  _is_full_backward_hook : NoneType
  # Declared as a compile-time constant. `forward` does not read it and
  # passes a literal False to the functional call instead.
  inplace : Final[bool] = False

  # Apply the functional `relu` to `input` and return its result.
  # NOTE(review): the literal False is presumably the `inplace` argument,
  # matching the constant above — confirm against the functional signature.
  def forward(self: __torch__.torch.nn.modules.activation.ReLU,
    input: Tensor) -> Tensor:
    _8 = __torch__.torch.nn.functional.relu(input, False, )
    return _8
