Source code for xpmir.text.encoders

from abc import ABC, abstractmethod
from typing import Generic, List, Tuple, TypeVar, Union, Optional, Callable
import sys

from attrs import define
from experimaestro import Param
import torch
import torch.nn as nn

from xpmir.learning.optim import Module

from xpmir.utils.utils import EasyLogger
from .tokenizers import (
    Tokenizer,
    TokenizedTexts,
    TokenizerBase,
    TokenizerOutput,
    TokenizerOptions,
)

T = TypeVar("T")


class Encoder(Module, EasyLogger, ABC):
    """Base class for all word and text encoders"""

    def __initialize__(self, options):
        # Easy and hacky way to get the device
        super().__initialize__(options)
        self._dummy_params = nn.Parameter(torch.Tensor())

    def static(self):
        return True

    @property
    def device(self):
        return self._dummy_params.device

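# Illustrative sketch (not part of the original module): because
# `_dummy_params` moves together with the module, `device` tracks wherever
# the encoder has been placed, e.g.:
#
#     encoder.to(torch.device("cuda"))  # assuming a CUDA device is available
#     assert encoder.device.type == "cuda"
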
@define
class TokensEncoderOutput:
    """Output representation for text encoder"""

    tokenized: TokenizedTexts
    """Tokenized texts"""

    value: torch.Tensor
    """The encoder output"""

class TokensEncoder(Tokenizer, Encoder):
    """(deprecated) Represent a text as a sequence of token representations"""

    def enc_query_doc(
        self, queries: List[str], documents: List[str], d_maxlen=None, q_maxlen=None
    ):
        """Returns encoded versions of the queries and documents from two
        lists (of the same size) of queries and documents.

        May be overridden in a subclass to provide contextualized
        representations, e.g. jointly modeling query and document
        representations in BERT.
        """
        tokenized_queries = self.batch_tokenize(queries, maxlen=q_maxlen)
        tokenized_documents = self.batch_tokenize(documents, maxlen=d_maxlen)
        return (
            tokenized_queries,
            self(tokenized_queries),
            tokenized_documents,
            self(tokenized_documents),
        )

    def forward(self, tokenized: TokenizedTexts):
        """Returns embeddings for the tokenized texts.

        tokenized: tokenized texts
        """
        raise NotImplementedError()

    def emb_views(self) -> int:
        """Returns how many "views" are returned by the embedding layer.

        Most have 1, but sometimes it's useful to return multiple, e.g.,
        BERT's multiple layers
        """
        return 1

    def dim(self) -> int:
        """Returns the number of dimensions of the embedding"""
        raise NotImplementedError(f"for {self.__class__}")

    def static(self) -> bool:
        """Returns True if the representations are static, i.e., not trained;
        otherwise False. This allows models to know when caching is
        appropriate.
        """
        return True

    def maxtokens(self) -> int:
        """Maximum number of tokens that can be processed"""
        return sys.maxsize

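# Illustrative sketch (`my_encoder` is a hypothetical concrete TokensEncoder):
# enc_query_doc returns the tokenized inputs alongside their token
# representations.
#
#     q_tok, q_emb, d_tok, d_emb = my_encoder.enc_query_doc(
#         ["what is bm25?"], ["BM25 is a ranking function."], q_maxlen=32
#     )
#     # q_emb and d_emb hold (batch x tokens x dim) token representations
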
LegacyEncoderInput = Union[
    List[str], List[Tuple[str, str]], List[Tuple[str, str, str]]
]

InputType = TypeVar("InputType")
EncoderOutput = TypeVar("EncoderOutput")

class TextEncoderBase(Encoder, Generic[InputType, EncoderOutput]):
    """Base class for all text encoders"""

    __call__: Callable[["TextEncoderBase", List[InputType]], EncoderOutput]

    @abstractmethod
    def forward(self, texts: List[InputType]) -> EncoderOutput:
        raise NotImplementedError()

    @property
    @abstractmethod
    def dimension(self) -> int:
        """Returns the dimension of the output space"""
        raise NotImplementedError()

    def max_tokens(self):
        """Returns the maximum number of tokens this encoder can process"""
        return sys.maxsize

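# Illustrative sketch (not part of the original module): a minimal concrete
# text encoder; the class name and the fixed dimension are hypothetical.
class HashedBagOfWordsEncoder(TextEncoderBase[str, torch.Tensor]):
    """(example) Encodes a text as a bag of hashed tokens"""

    DIMENSION = 256

    def forward(self, texts: List[str]) -> torch.Tensor:
        # One row per text; each whitespace token increments a hashed bucket
        x = torch.zeros(len(texts), self.DIMENSION)
        for i, text in enumerate(texts):
            for token in text.lower().split():
                x[i, hash(token) % self.DIMENSION] += 1.0
        return x

    @property
    def dimension(self) -> int:
        return self.DIMENSION
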
class TextEncoder(TextEncoderBase[str, torch.Tensor]):
    """Encodes a text into a vector

    .. deprecated:: 1.3
        Use TextEncoderBase directly
    """

    pass

class DualTextEncoder(TextEncoderBase[Tuple[str, str], torch.Tensor]):
    """Encodes a pair of texts into a vector

    .. deprecated:: 1.3
        Use TextEncoderBase directly
    """

    pass

class TripletTextEncoder(TextEncoderBase[Tuple[str, str, str], torch.Tensor]):
    """Encodes a triplet of texts into a vector

    .. deprecated:: 1.3
        Use TextEncoderBase directly

    This is used in models such as DuoBERT where we encode
    (query, positive, negative) triplets.
    """

    pass

# --- Generic tokenized text encoders


@define
class RepresentationOutput:
    value: torch.Tensor
    """An arbitrary representation"""


@define
class TokensRepresentationOutput(RepresentationOutput):
    """A 3D tensor (batch x tokens x dimension)"""

    tokenized: TokenizedTexts
    """Tokenized texts"""


@define
class TextsRepresentationOutput(RepresentationOutput):
    """Value is a tensor representing full texts (batch x dimension)"""

    tokenized: TokenizedTexts
    """Tokenized texts"""

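# Illustrative sketch (values are made up): wrapping an encoder result into
# the attrs output classes above.
#
#     value = torch.randn(2, 16, 768)  # batch x tokens x dimension
#     out = TokensRepresentationOutput(value=value, tokenized=tokenized)
#     out.value.shape  # torch.Size([2, 16, 768])
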
class TokenizedEncoder(Encoder, Generic[EncoderOutput, TokenizerOutput]):
    """Encodes a tokenized text into a vector"""

    @abstractmethod
    def forward(self, inputs: TokenizerOutput) -> EncoderOutput:
        pass

    @property
    def max_length(self):
        """Returns the maximum length that the model can process"""
        return sys.maxsize

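# Illustrative sketch (hypothetical class; assumes the tokenizer output
# exposes token ids): a token-level encoder backed by an embedding table.
#
#     class EmbeddingTableEncoder(
#         TokenizedEncoder[TokensRepresentationOutput, TokenizedTexts]
#     ):
#         def forward(self, inputs: TokenizedTexts) -> TokensRepresentationOutput:
#             value = self.embeddings(inputs.ids)  # assuming an `ids` field
#             return TokensRepresentationOutput(value=value, tokenized=inputs)
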
class TokenizedTextEncoderBase(TextEncoderBase[InputType, EncoderOutput]):
    @abstractmethod
    def forward(
        self, inputs: List[InputType], options: Optional[TokenizerOptions] = None
    ) -> EncoderOutput:
        ...

class TokenizedTextEncoder(
    TokenizedTextEncoderBase[InputType, EncoderOutput],
    Generic[InputType, EncoderOutput, TokenizerOutput],
):
    """Encodes a tokenizer input into a vector

    This pipelines two objects:

    1. A tokenizer that segments the text;
    2. An encoder that returns a representation of the tokens in a vector
       space
    """

    tokenizer: Param[TokenizerBase[InputType, TokenizerOutput]]
    encoder: Param[TokenizedEncoder[TokenizerOutput, EncoderOutput]]

    def __initialize__(self, options):
        super().__initialize__(options)
        self.tokenizer.initialize(options)
        self.encoder.initialize(options)

    def forward(
        self, inputs: List[InputType], options: Optional[TokenizerOptions] = None
    ) -> EncoderOutput:
        options = options or TokenizerOptions()
        options.max_length = min(
            self.encoder.max_length, options.max_length or sys.maxsize
        )
        tokenized = self.tokenizer.tokenize(inputs, options)
        return self.encoder(tokenized)

    def static(self):
        """Whether the encoder is static, i.e. its parameters are not
        learned"""
        return self.encoder.static()

    @property
    def dimension(self):
        return self.encoder.dimension

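# Usage sketch (`my_tokenizer` and `my_token_encoder` are hypothetical
# instances): the pipeline tokenizes, truncates to the encoder's maximum
# length, and delegates to the token-level encoder.
#
#     encoder = TokenizedTextEncoder(
#         tokenizer=my_tokenizer,    # a TokenizerBase
#         encoder=my_token_encoder,  # a TokenizedEncoder
#     )
#     # after experimaestro initialization:
#     output = encoder(["a first text", "a second text"])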