Source code for xpmir.text.tokenizers

from abc import ABC, abstractmethod
from typing import List, NamedTuple, Optional, TypeVar, Generic
from attr import define
import re

import torch

from experimaestro import Config
from xpmir.text.utils import lengthToMask
from xpmir.learning.optim import ModuleInitOptions
from xpmir.utils.utils import Initializable
from xpmir.utils.torch import to_device

class TokenizedTexts(NamedTuple):
    """Tokenized texts output"""

    tokens: List[List[str]]
    """The list of tokens"""

    ids: torch.LongTensor
    """A matrix containing the token IDs"""

    lens: List[int]
    """the lengths of each text (in tokens)"""

    mask: Optional[torch.LongTensor]
    """The mask for the ids matrix"""

    token_type_ids: Optional[torch.LongTensor] = None
    """Type of each token"""

    def to(self, device: torch.device):
        if device is self.ids.device:
            return self

        return TokenizedTexts(
            self.tokens,
            self.ids.to(device),
            self.lens,
            to_device(self.mask, device),
            to_device(self.token_type_ids, device),
        )

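
# Illustrative sketch (not part of the original module): shows how a
# TokenizedTexts can be built by hand and moved between devices. All values
# below (tokens, ids, lengths) are made up for the example.
def _example_tokenized_texts() -> TokenizedTexts:
    lens = [3, 2]
    tokenized = TokenizedTexts(
        tokens=[["a", "small", "example"], ["two", "tokens"]],
        ids=torch.LongTensor([[5, 8, 2], [3, 1, 0]]),
        lens=lens,
        mask=lengthToMask(torch.LongTensor(lens)),
    )
    # to() is a no-op when the target device already holds the ids
    return tokenized.to(torch.device("cpu"))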

class Tokenizer(Config):
    """Represents a vocabulary and a tokenization method

    **Deprecated**: use TokenizerBase instead
    """

    def tokenize(self, text):
        """Tokenizes a text

        Meant to be overridden to provide vocabulary-specific tokenization
        when necessary, e.g. BERT's WordPiece tokenization
        """
        text = text.lower()
        text = re.sub(r"[^a-z0-9]", " ", text)
        return text.split()

    def pad_sequences(self, tokensList: List[List[int]], batch_first=True, maxlen=0):
        padding_value = 0
        lens = [len(s) for s in tokensList]

        if maxlen is None:
            maxlen = max(lens)
        else:
            maxlen = min(maxlen or 0, max(lens))

        if batch_first:
            out_tensor = torch.full(
                (len(tokensList), maxlen), padding_value, dtype=torch.long
            )
            for i, tokens in enumerate(tokensList):
                out_tensor[i, : lens[i], ...] = torch.LongTensor(tokens[:maxlen])
        else:
            out_tensor = torch.full(
                (maxlen, len(tokensList)), padding_value, dtype=torch.long
            )
            for i, tokens in enumerate(tokensList):
                out_tensor[: lens[i], i, ...] = torch.LongTensor(tokens[:maxlen])

        return out_tensor, lens
    def batch_tokenize(
        self, texts: List[str], batch_first=True, maxlen=None, mask=False
    ) -> TokenizedTexts:
        """Returns tokenized texts

        Arguments:
            mask: Whether a mask should be computed
        """
        toks = [self.tokenize(text) for text in texts]
        tokids, lens = self.pad_sequences(
            [[self.tok2id(t) for t in tok] for tok in toks],
            batch_first=batch_first,
            maxlen=maxlen,
        )

        _mask = lengthToMask(torch.LongTensor(lens)) if mask else None
        return TokenizedTexts(toks, tokids, lens, _mask)
    @property
    def pad_tokenid(self) -> int:
        raise NotImplementedError()
    def tok2id(self, tok: str) -> int:
        """Converts a token to an integer id"""
        raise NotImplementedError()
    def id2tok(self, idx: int) -> str:
        """Converts an integer id to a token"""
        raise NotImplementedError()
    def lexicon_size(self) -> int:
        """Returns the number of items in the lexicon"""
        raise NotImplementedError()
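
# Illustrative sketch (not part of the original module): a toy subclass of the
# deprecated Tokenizer backed by a tiny hand-written lexicon. The class name,
# the vocabulary and the unknown-token handling are assumptions made for the
# example, not an actual xpmir tokenizer.
_TOY_VOCAB = {"hello": 1, "world": 2}


class _ToyTokenizer(Tokenizer):
    @property
    def pad_tokenid(self) -> int:
        return 0

    def tok2id(self, tok: str) -> int:
        # Unknown tokens are mapped to the padding id in this sketch
        return _TOY_VOCAB.get(tok, 0)

    def id2tok(self, idx: int) -> str:
        return {v: k for k, v in _TOY_VOCAB.items()}.get(idx, "[PAD]")

    def lexicon_size(self) -> int:
        return len(_TOY_VOCAB) + 1


# Usage (assuming the Config subclass can be used directly in this context):
#   tokenized = _ToyTokenizer().batch_tokenize(["Hello, world!"], mask=True)
#   tokenized.ids -> tensor([[1, 2]]); tokenized.lens -> [2]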

TokenizerInput = TypeVar("TokenizerInput")
TokenizerOutput = TypeVar("TokenizerOutput", bound=TokenizedTexts)


@define
class TokenizerOptions:
    max_length: Optional[int] = None
    return_mask: Optional[bool] = True
    return_length: Optional[bool] = True

class TokenizerBase(
    Config, Initializable, Generic[TokenizerInput, TokenizerOutput], ABC
):
    """Base tokenizer"""

    def __initialize__(self, options: ModuleInitOptions):
        super().__initialize__(options)

    @abstractmethod
    def tokenize(
        self, inputs: TokenizerInput, options: Optional[TokenizerOptions] = None
    ) -> TokenizerOutput:
        """Encodes the inputs"""
        ...

    @abstractmethod
    def vocabulary_size(self) -> int:
        """Returns the number of tokens"""
        ...

    @abstractmethod
    def tok2id(self, tok: str) -> int:
        """Converts a token to an integer id"""
        ...

    @abstractmethod
    def id2tok(self, idx: int) -> str:
        """Converts an integer id to a token"""
        ...
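
# Illustrative sketch (not part of the original module): a minimal
# TokenizerBase implementation over whitespace tokenization, reusing the toy
# lexicon defined above. The class name and the truncation/padding logic are
# assumptions for the example, not an actual xpmir tokenizer.
class _ToyTokenizerBase(TokenizerBase[List[str], TokenizedTexts]):
    def tokenize(
        self, inputs: List[str], options: Optional[TokenizerOptions] = None
    ) -> TokenizedTexts:
        options = options or TokenizerOptions()
        tokens = [text.lower().split() for text in inputs]
        if options.max_length is not None:
            tokens = [tok[: options.max_length] for tok in tokens]

        lens = [len(tok) for tok in tokens]
        ids = torch.zeros(len(tokens), max(lens), dtype=torch.long)
        for i, tok in enumerate(tokens):
            ids[i, : lens[i]] = torch.LongTensor([self.tok2id(t) for t in tok])

        mask = lengthToMask(torch.LongTensor(lens)) if options.return_mask else None
        return TokenizedTexts(tokens, ids, lens, mask)

    def vocabulary_size(self) -> int:
        return len(_TOY_VOCAB) + 1

    def tok2id(self, tok: str) -> int:
        return _TOY_VOCAB.get(tok, 0)

    def id2tok(self, idx: int) -> str:
        return {v: k for k, v in _TOY_VOCAB.items()}.get(idx, "[PAD]")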