Source code for xpmir.index.anserini

from pathlib import Path
from typing import List, Tuple
from experimaestro.compat import cached_property
from experimaestro import Choices, Param, Annotated
from datamaestro_text.data.ir import AdhocIndex


[docs]class Index(AdhocIndex): """Anserini-backed index Attributes: path: Path to the index storePositions: Store term positions storeDocvectors: Store document term vectors storeRaw: Store raw document storeContents: Store processed documents (e.g. without HTML tags) stemmer: The stemmer to use """ path: Param[Path] storePositions: Param[bool] = False storeDocvectors: Param[bool] = False storeRaw: Param[bool] = False storeContents: Param[bool] = False stemmer: Annotated[str, Choices(["porter", "krovetz", "none"])] = "porter" _index_reader = None _stats = None @cached_property def index_reader(self): from pyserini.index import IndexReader return IndexReader(str(self.path)) def __getstate__(self): return { key: value for key, value in self.__dict__.items() if key != "index_reader" } @cached_property def documentcount(self) -> int: return self.index_reader.stats()["documents"] @cached_property def termcount(self): return self.index_reader.stats()["total_terms"] def document_text(self, docid): doc = self.index_reader.doc(docid) return doc.contents() if doc is not None else None @cached_property def terms(self): """Returns a map""" return {entry.term: (entry.df, entry.cf) for entry in self.index_reader.terms()} def iter_documents(self) -> List[Tuple[str, str]]: """Returns the document contents""" for i in range(self.documentcount): docid = self.index_reader.convert_internal_docid_to_collection_docid(i) yield docid, self.document_text(docid) def docid_internal2external(self, ix): return self.index_reader.convert_internal_docid_to_collection_docid(ix) def term_df(self, term: str): x: List[str] = self.index_reader.analyze(term) if x: return self.terms.get(x[0], (0, 0))[0] return 0