Source code for xpmir.papers.helpers.samplers

# Utility functions for MS-Marco experiments

from typing import Union
from functools import lru_cache

from experimaestro import Launcher

from datamaestro import prepare_dataset
from datamaestro_text.transforms.ir import (
    ShuffledTrainingTripletsLines,
    StoreTrainingTripletTopicAdapter,
    StoreTrainingTripletDocumentAdapter,
)
from datamaestro_text.data.ir import Documents, Adhoc

from xpmir.utils.functools import cache
from xpmir.datasets.adapters import RandomFold
from xpmir.evaluation import Evaluations, EvaluationsCollection
from xpmir.letor.samplers import TripletBasedSampler
from xpmir.datasets.adapters import MemoryTopicStore
from xpmir.letor.distillation.samplers import (
    DistillationPairwiseSampler,
    PairwiseHydrator,
)
from xpmir.letor.samplers.hydrators import SampleHydrator, PairwiseTransformAdapter

from xpmir.measures import AP, RR, P, nDCG, Success
from xpmir.papers import configuration


@configuration
class ValidationSample:
    seed: int = 123
    size: int = 500
@lru_cache
def prepare_collection(prepare_str: str) -> Union[Documents, Adhoc]:
    """Prepares a dataset and caches the result"""
    return prepare_dataset(prepare_str)


MEASURES = [AP, P @ 20, nDCG, nDCG @ 10, nDCG @ 20, RR, RR @ 10, Success @ 5]


# --- MS Marco v1


@cache
def msmarco_v1_docpairs_sampler(
    *,
    sample_rate: float = 1.0,
    sample_max: int = 0,
    launcher: "Launcher" = None,
) -> TripletBasedSampler:
    """Train sampler (deprecated: use msmarco_v1_docpairs_efficient_sampler)

    This uses shuffled pre-computed triplets from MS Marco

    :param sample_rate: Sample rate for the triplets (default 1)
    """
    topics = prepare_dataset("irds.msmarco-passage.train.queries")
    train_triples = prepare_dataset("irds.msmarco-passage.train.docpairs")
    triplets = ShuffledTrainingTripletsLines(
        seed=123,
        data=StoreTrainingTripletTopicAdapter(data=train_triples, store=topics),
        sample_rate=sample_rate,
        sample_max=sample_max,
        doc_ids=True,
        topic_ids=False,
    ).submit(launcher=launcher)

    # Adds the text to the documents
    triplets = StoreTrainingTripletDocumentAdapter(
        data=triplets, store=prepare_collection("irds.msmarco-passage.documents")
    )
    return TripletBasedSampler(source=triplets)


@cache
def msmarco_v1_docpairs_efficient_sampler(
    *,
    sample_rate: float = 1.0,
    sample_max: int = 0,
    launcher: "Launcher" = None,
    seed: int = 123,
) -> TripletBasedSampler:
    """Train sampler

    This uses shuffled pre-computed triplets from MS Marco

    :param sample_rate: Sample rate for the triplets (default 1)
    """
    topics = prepare_dataset("irds.msmarco-passage.train.queries")
    train_triples = prepare_dataset("irds.msmarco-passage.train.docpairs")
    triplets = ShuffledTrainingTripletsLines(
        seed=seed,
        data=StoreTrainingTripletTopicAdapter(data=train_triples, store=topics),
        sample_rate=sample_rate,
        sample_max=sample_max,
        doc_ids=True,
        topic_ids=False,
    ).submit(launcher=launcher)

    # Builds the sampler by hydrating documents with their text
    sampler = TripletBasedSampler(source=triplets)
    hydrator = SampleHydrator(
        documentstore=prepare_collection("irds.msmarco-passage.documents")
    )
    return PairwiseTransformAdapter(sampler=sampler, adapter=hydrator)


@cache
def msmarco_v1_validation_dataset(cfg: ValidationSample, launcher=None):
    """Samples dev topics to get a validation subset"""
    return RandomFold(
        dataset=prepare_collection("irds.msmarco-passage.dev"),
        seed=cfg.seed,
        fold=0,
        sizes=[cfg.size],
        exclude=prepare_collection("irds.msmarco-passage.dev.small").topics,
    ).submit(launcher=launcher)


@cache
def msmarco_v1_tests(dev_test_size: int = 0):
    """MS Marco default test collections: TREC DL 2019 & 2020 + dev.small

    dev.small can be restricted to a smaller subset (for debugging) by setting
    dev_test_size to a positive value
    """
    v1_devsmall_ds = prepare_collection("irds.msmarco-passage.dev.small")
    if dev_test_size > 0:
        (v1_devsmall_ds,) = RandomFold.folds(
            seed=0, sizes=[dev_test_size], dataset=v1_devsmall_ds
        )

    return EvaluationsCollection(
        msmarco_dev=Evaluations(v1_devsmall_ds, MEASURES),
        trec2019=Evaluations(
            prepare_dataset("irds.msmarco-passage.trec-dl-2019"), MEASURES
        ),
        trec2020=Evaluations(
            prepare_dataset("irds.msmarco-passage.trec-dl-2020"), MEASURES
        ),
    )


@cache
def msmarco_hofstaetter_ensemble_hard_negatives() -> DistillationPairwiseSampler:
    """Hard negatives from Hofstätter et al. (2020)

    Hard negatives obtained by distillation from a cross-encoder ensemble:
    "Improving Efficient Neural Ranking Models with Cross-Architecture
    Knowledge Distillation" (Sebastian Hofstätter, Sophia Althammer,
    Michael Schröder, Mete Sertkan, Allan Hanbury), 2020
    """
    train_triples_distil = prepare_dataset(
        "com.github.sebastian-hofstaetter."
        "neural-ranking-kd.msmarco.ensemble.teacher"
    )

    # Access to topic text
    train_topics = prepare_dataset("irds.msmarco-passage.train.queries")

    # Combine the training triplets with the document and query texts
    distillation_samples = PairwiseHydrator(
        samples=train_triples_distil,
        documentstore=prepare_collection("irds.msmarco-passage.documents"),
        querystore=MemoryTopicStore(topics=train_topics),
    )

    # Generate a sampler from the samples
    return DistillationPairwiseSampler(samples=distillation_samples)


@cache
def finetuning_validation_dataset(
    cfg: ValidationSample, dataset_id: str, launcher=None
):
    """Samples dev topics to get a validation subset"""
    return RandomFold(
        dataset=prepare_collection(dataset_id),
        seed=cfg.seed,
        fold=0,
        sizes=[cfg.size],
    ).submit(launcher=launcher)