# Source code for xpmir.text.adapters

from typing import List

from experimaestro import Param
from datamaestro_ir.data import TextRecord
from xpmir.utils.convert import Converter

from .encoders import InputType, RepresentationOutput, TokenizedTextEncoderBase



# [docs]
class MeanTextEncoder(TokenizedTextEncoderBase[InputType, RepresentationOutput]):
    """Text encoder that averages the embeddings produced by another encoder

    Initialization, ``static()`` and ``dimension`` are delegated to the
    wrapped ``encoder``; ``forward`` replaces the ``value`` of the returned
    representation by its mean over axis 1.
    """

    # Wrapped encoder whose output is averaged
    encoder: Param[TokenizedTextEncoderBase[InputType, RepresentationOutput]]

    def __initialize__(self):
        """Initialize the wrapped encoder"""
        wrapped = self.encoder
        wrapped.__initialize__()

    def static(self):
        """Return whatever the wrapped encoder reports for ``static()``"""
        wrapped = self.encoder
        return wrapped.static()

    @property
    def dimension(self):
        """Dimension reported by the wrapped encoder"""
        return self.encoder.dimension

    def forward(self, texts: List[InputType], options=None) -> RepresentationOutput:
        """Encode ``texts`` with the wrapped encoder and average the result

        :param texts: inputs forwarded unchanged to the wrapped encoder
        :param options: forwarded as the ``options`` keyword argument
        :return: the object returned by the wrapped encoder, with its
            ``value`` attribute overwritten in place by ``value.mean(1)``
        """
        output = self.encoder(texts, options=options)

        # Average over axis 1 -- the time dimension, the wrapped encoder
        # output being expected to be batch x time x dim
        token_values = output.value
        output.value = token_values.mean(1)
        return output




# [docs]
class TopicTextConverter(Converter[TextRecord, str]):
    """Converter mapping a topic record to its text"""

    def __call__(self, input: TextRecord) -> str:
        """Return the ``text`` attribute of the record's ``"text_item"`` entry

        :param input: the record, indexed with the ``"text_item"`` key
        :return: the value of ``.text`` on that entry
        """
        text_item = input["text_item"]
        return text_item.text