diff --git a/docs/_src/api/api/retriever.md b/docs/_src/api/api/retriever.md index ad6078caa6..34e2875eb3 100644 --- a/docs/_src/api/api/retriever.md +++ b/docs/_src/api/api/retriever.md @@ -28,13 +28,7 @@ Base class for regular retrievers. ```python @abstractmethod -def retrieve(query: str, - filters: Optional[Dict[str, Union[Dict, List, str, int, float, - bool]]] = None, - top_k: Optional[int] = None, - index: str = None, - headers: Optional[Dict[str, str]] = None, - scale_score: bool = None) -> List[Document] +def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -67,13 +61,7 @@ Wrapper method used to time functions. #### BaseRetriever.eval ```python -def eval(label_index: str = "label", - doc_index: str = "eval_document", - label_origin: str = "gold-label", - top_k: int = 10, - open_domain: bool = False, - return_preds: bool = False, - headers: Optional[Dict[str, str]] = None) -> dict +def eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False, headers: Optional[Dict[str, str]] = None) -> dict ``` Performs evaluation on the Retriever. @@ -122,11 +110,7 @@ class BM25Retriever(BaseRetriever) #### BM25Retriever.\_\_init\_\_ ```python -def __init__(document_store: KeywordDocumentStore, - top_k: int = 10, - all_terms_must_match: bool = False, - custom_query: Optional[str] = None, - scale_score: bool = True) +def __init__(document_store: KeywordDocumentStore, top_k: int = 10, all_terms_must_match: bool = False, custom_query: Optional[str] = None, scale_score: bool = True) ``` **Arguments**: @@ -210,13 +194,7 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### BM25Retriever.retrieve ```python -def retrieve(query: str, - filters: Optional[Dict[str, Union[Dict, List, str, int, float, - bool]]] = None, - top_k: Optional[int] = None, - index: str = None, - headers: Optional[Dict[str, str]] = None, - scale_score: bool = None) -> List[Document] +def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -302,18 +280,12 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### BM25Retriever.retrieve\_batch ```python -def retrieve_batch(queries: List[str], - filters: Optional[Union[Dict[str, Union[Dict, List, str, - int, float, bool]], - List[Dict[str, - Union[Dict, List, str, - int, float, - bool]]], ]] = None, - top_k: Optional[int] = None, - index: str = None, - headers: Optional[Dict[str, str]] = None, - batch_size: Optional[int] = None, - scale_score: bool = None) -> List[List[Document]] +def retrieve_batch(queries: List[str], filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] ``` Scan through documents in DocumentStore and return a small number documents @@ -414,12 +386,7 @@ Helpful for benchmarking, testing and if you want to do QA on small documents wi #### FilterRetriever.retrieve ```python -def retrieve(query: str, - filters: dict = None, - top_k: Optional[int] = None, - index: str = None, - headers: Optional[Dict[str, str]] = None, - scale_score: bool = None) -> List[Document] +def retrieve(query: str, filters: dict = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -458,9 +425,7 @@ It uses sklearn's TfidfVectorizer to compute a tf-idf matrix. #### TfidfRetriever.\_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, - top_k: int = 10, - auto_fit=True) +def __init__(document_store: BaseDocumentStore, top_k: int = 10, auto_fit=True) ``` **Arguments**: @@ -474,16 +439,12 @@ def __init__(document_store: BaseDocumentStore, #### TfidfRetriever.retrieve ```python -def retrieve(query: str, - filters: Optional[Union[Dict[str, Union[Dict, List, str, int, - float, bool]], - List[Dict[str, - Union[Dict, List, str, int, - float, bool]]], ]] = None, - top_k: Optional[int] = None, - index: str = None, - headers: Optional[Dict[str, str]] = None, - scale_score: bool = None) -> List[Document] +def retrieve(query: str, filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -505,14 +466,7 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### TfidfRetriever.retrieve\_batch ```python -def retrieve_batch(queries: Union[str, List[str]], - filters: Optional[Dict[str, Union[Dict, List, str, int, - float, bool]]] = None, - top_k: Optional[int] = None, - index: str = None, - headers: Optional[Dict[str, str]] = None, - batch_size: Optional[int] = None, - scale_score: bool = None) -> List[List[Document]] +def retrieve_batch(queries: Union[str, List[str]], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] ``` Scan through documents in DocumentStore and return a small number documents @@ -565,25 +519,7 @@ Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Que #### DensePassageRetriever.\_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, - query_embedding_model: Union[ - Path, str] = "facebook/dpr-question_encoder-single-nq-base", - passage_embedding_model: Union[ - Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", - model_version: Optional[str] = None, - max_seq_len_query: int = 64, - max_seq_len_passage: int = 256, - top_k: int = 10, - use_gpu: bool = True, - batch_size: int = 16, - embed_title: bool = True, - use_fast_tokenizers: bool = True, - similarity_function: str = "dot_product", - global_loss_buffer_size: int = 150000, - progress_bar: bool = True, - devices: Optional[List[Union[str, torch.device]]] = None, - use_auth_token: Optional[Union[str, bool]] = None, - scale_score: bool = True) +def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True) ``` Init the Retriever incl. the two encoder models from a local or remote model checkpoint. @@ -651,13 +587,7 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### DensePassageRetriever.retrieve ```python -def retrieve(query: str, - filters: Optional[Dict[str, Union[Dict, List, str, int, float, - bool]]] = None, - top_k: Optional[int] = None, - index: str = None, - headers: Optional[Dict[str, str]] = None, - scale_score: bool = None) -> List[Document] +def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -741,18 +671,12 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### DensePassageRetriever.retrieve\_batch ```python -def retrieve_batch(queries: List[str], - filters: Optional[Union[Dict[str, Union[Dict, List, str, - int, float, bool]], - List[Dict[str, - Union[Dict, List, str, - int, float, - bool]]], ]] = None, - top_k: Optional[int] = None, - index: str = None, - headers: Optional[Dict[str, str]] = None, - batch_size: Optional[int] = None, - scale_score: bool = None) -> List[List[Document]] +def retrieve_batch(queries: List[str], filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] ``` Scan through documents in DocumentStore and return a small number documents @@ -878,36 +802,7 @@ Embeddings of documents / passages shape (batch_size, embedding_dim) #### DensePassageRetriever.train ```python -def train(data_dir: str, - train_filename: str, - dev_filename: str = None, - test_filename: str = None, - max_samples: int = None, - max_processes: int = 128, - multiprocessing_strategy: Optional[str] = None, - dev_split: float = 0, - batch_size: int = 2, - embed_title: bool = True, - num_hard_negatives: int = 1, - num_positives: int = 1, - n_epochs: int = 3, - evaluate_every: int = 1000, - n_gpu: int = 1, - learning_rate: float = 1e-5, - epsilon: float = 1e-08, - weight_decay: float = 0.0, - num_warmup_steps: int = 100, - grad_acc_steps: int = 1, - use_amp: str = None, - optimizer_name: str = "AdamW", - optimizer_correct_bias: bool = True, - save_dir: str = "../saved_models/dpr", - query_encoder_save_dir: str = "query_encoder", - passage_encoder_save_dir: str = "passage_encoder", - checkpoint_root_dir: Path = Path("model_checkpoints"), - checkpoint_every: Optional[int] = None, - checkpoints_to_keep: int = 3, - early_stopping: Optional[EarlyStopping] = None) +def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, multiprocessing_strategy: Optional[str] = None, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder", checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, early_stopping: Optional[EarlyStopping] = None) ``` train a DensePassageRetrieval model @@ -961,9 +856,7 @@ If any checkpoints are stored, a subsequent run of train() will resume training #### DensePassageRetriever.save ```python -def save(save_dir: Union[Path, str], - query_encoder_dir: str = "query_encoder", - passage_encoder_dir: str = "passage_encoder") +def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder") ``` Save DensePassageRetriever to the specified directory. @@ -984,18 +877,7 @@ None ```python @classmethod -def load(cls, - load_dir: Union[Path, str], - document_store: BaseDocumentStore, - max_seq_len_query: int = 64, - max_seq_len_passage: int = 256, - use_gpu: bool = True, - batch_size: int = 16, - embed_title: bool = True, - use_fast_tokenizers: bool = True, - similarity_function: str = "dot_product", - query_encoder_dir: str = "query_encoder", - passage_encoder_dir: str = "passage_encoder") +def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder") ``` Load DensePassageRetriever from the specified directory. @@ -1019,30 +901,7 @@ Kostić, Bogdan, et al. (2021): "Multi-modal Retrieval of Tables and Texts Using #### TableTextRetriever.\_\_init\_\_ ```python -def __init__( - document_store: BaseDocumentStore, - query_embedding_model: Union[ - Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", - passage_embedding_model: Union[ - Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", - table_embedding_model: Union[ - Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", - model_version: Optional[str] = None, - max_seq_len_query: int = 64, - max_seq_len_passage: int = 256, - max_seq_len_table: int = 256, - top_k: int = 10, - use_gpu: bool = True, - batch_size: int = 16, - embed_meta_fields: List[str] = ["name", "section_title", "caption"], - use_fast_tokenizers: bool = True, - similarity_function: str = "dot_product", - global_loss_buffer_size: int = 150000, - progress_bar: bool = True, - devices: Optional[List[Union[str, torch.device]]] = None, - use_auth_token: Optional[Union[str, bool]] = None, - scale_score: bool = True, - use_fast: bool = True) +def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, use_fast: bool = True) ``` Init the Retriever incl. the two encoder models from a local or remote model checkpoint. @@ -1097,18 +956,12 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### TableTextRetriever.retrieve\_batch ```python -def retrieve_batch(queries: List[str], - filters: Optional[Union[Dict[str, Union[Dict, List, str, - int, float, bool]], - List[Dict[str, - Union[Dict, List, str, - int, float, - bool]]], ]] = None, - top_k: Optional[int] = None, - index: str = None, - headers: Optional[Dict[str, str]] = None, - batch_size: Optional[int] = None, - scale_score: bool = None) -> List[List[Document]] +def retrieve_batch(queries: List[str], filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] ``` Scan through documents in DocumentStore and return a small number documents @@ -1237,38 +1090,7 @@ Embeddings of documents / passages. Shape: (batch_size, embedding_dim) #### TableTextRetriever.train ```python -def train(data_dir: str, - train_filename: str, - dev_filename: str = None, - test_filename: str = None, - max_samples: int = None, - max_processes: int = 128, - dev_split: float = 0, - batch_size: int = 2, - embed_meta_fields: List[str] = [ - "page_title", "section_title", "caption" - ], - num_hard_negatives: int = 1, - num_positives: int = 1, - n_epochs: int = 3, - evaluate_every: int = 1000, - n_gpu: int = 1, - learning_rate: float = 1e-5, - epsilon: float = 1e-08, - weight_decay: float = 0.0, - num_warmup_steps: int = 100, - grad_acc_steps: int = 1, - use_amp: str = None, - optimizer_name: str = "AdamW", - optimizer_correct_bias: bool = True, - save_dir: str = "../saved_models/mm_retrieval", - query_encoder_save_dir: str = "query_encoder", - passage_encoder_save_dir: str = "passage_encoder", - table_encoder_save_dir: str = "table_encoder", - checkpoint_root_dir: Path = Path("model_checkpoints"), - checkpoint_every: Optional[int] = None, - checkpoints_to_keep: int = 3, - early_stopping: Optional[EarlyStopping] = None) +def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_meta_fields: List[str] = ["page_title", "section_title", "caption"], num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/mm_retrieval", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder", table_encoder_save_dir: str = "table_encoder", checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, early_stopping: Optional[EarlyStopping] = None) ``` Train a TableTextRetrieval model. @@ -1322,10 +1144,7 @@ checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is cr #### TableTextRetriever.save ```python -def save(save_dir: Union[Path, str], - query_encoder_dir: str = "query_encoder", - passage_encoder_dir: str = "passage_encoder", - table_encoder_dir: str = "table_encoder") +def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder") ``` Save TableTextRetriever to the specified directory. @@ -1347,20 +1166,7 @@ None ```python @classmethod -def load(cls, - load_dir: Union[Path, str], - document_store: BaseDocumentStore, - max_seq_len_query: int = 64, - max_seq_len_passage: int = 256, - max_seq_len_table: int = 256, - use_gpu: bool = True, - batch_size: int = 16, - embed_meta_fields: List[str] = ["name", "section_title", "caption"], - use_fast_tokenizers: bool = True, - similarity_function: str = "dot_product", - query_encoder_dir: str = "query_encoder", - passage_encoder_dir: str = "passage_encoder", - table_encoder_dir: str = "table_encoder") +def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder") ``` Load TableTextRetriever from the specified directory. @@ -1378,21 +1184,7 @@ class EmbeddingRetriever(BaseRetriever) #### EmbeddingRetriever.\_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, - embedding_model: str, - model_version: Optional[str] = None, - use_gpu: bool = True, - batch_size: int = 32, - max_seq_len: int = 512, - model_format: Optional[str] = None, - pooling_strategy: str = "reduce_mean", - emb_extraction_layer: int = -1, - top_k: int = 10, - progress_bar: bool = True, - devices: Optional[List[Union[str, torch.device]]] = None, - use_auth_token: Optional[Union[str, bool]] = None, - scale_score: bool = True, - embed_meta_fields: List[str] = []) +def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: Optional[str] = None, pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, embed_meta_fields: List[str] = []) ``` **Arguments**: @@ -1447,13 +1239,7 @@ performance if your titles contain meaningful information for retrieval #### EmbeddingRetriever.retrieve ```python -def retrieve(query: str, - filters: Optional[Dict[str, Union[Dict, List, str, int, float, - bool]]] = None, - top_k: Optional[int] = None, - index: str = None, - headers: Optional[Dict[str, str]] = None, - scale_score: bool = None) -> List[Document] +def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -1537,18 +1323,12 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### EmbeddingRetriever.retrieve\_batch ```python -def retrieve_batch(queries: List[str], - filters: Optional[Union[Dict[str, Union[Dict, List, str, - int, float, bool]], - List[Dict[str, - Union[Dict, List, str, - int, float, - bool]]], ]] = None, - top_k: Optional[int] = None, - index: str = None, - headers: Optional[Dict[str, str]] = None, - batch_size: Optional[int] = None, - scale_score: bool = None) -> List[List[Document]] +def retrieve_batch(queries: List[str], filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] ``` Scan through documents in DocumentStore and return a small number documents @@ -1674,11 +1454,7 @@ Embeddings, one per input document #### EmbeddingRetriever.train ```python -def train(training_data: List[Dict[str, Any]], - learning_rate: float = 2e-5, - n_epochs: int = 1, - num_warmup_steps: int = None, - batch_size: int = 16) -> None +def train(training_data: List[Dict[str, Any]], learning_rate: float = 2e-5, n_epochs: int = 1, num_warmup_steps: int = None, batch_size: int = 16, train_loss: str = "mnrl") -> None ``` Trains/adapts the underlying embedding model. @@ -1697,6 +1473,8 @@ Each training data example is a dictionary with the following keys: - `n_epochs` (`int`): The number of epochs - `num_warmup_steps` (`int`): The number of warmup steps - `batch_size` (`int (optional)`): The batch size to use for the training, defaults to 16 +- `train_loss` (`str (optional)`): The loss to use for training. +If using sentence-transformers, one of 'mnrl' (Multiple Negatives Ranking Loss) or 'margin_mse' (MarginMSE) @@ -1731,22 +1509,7 @@ Xiong, Wenhan, et. al. (2020): "Answering complex open-domain questions with mul #### MultihopEmbeddingRetriever.\_\_init\_\_ ```python -def __init__(document_store: BaseDocumentStore, - embedding_model: str, - model_version: Optional[str] = None, - num_iterations: int = 2, - use_gpu: bool = True, - batch_size: int = 32, - max_seq_len: int = 512, - model_format: str = "farm", - pooling_strategy: str = "reduce_mean", - emb_extraction_layer: int = -1, - top_k: int = 10, - progress_bar: bool = True, - devices: Optional[List[Union[str, torch.device]]] = None, - use_auth_token: Optional[Union[str, bool]] = None, - scale_score: bool = True, - embed_meta_fields: List[str] = []) +def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, num_iterations: int = 2, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: str = "farm", pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, embed_meta_fields: List[str] = []) ``` **Arguments**: @@ -1802,13 +1565,7 @@ performance if your titles contain meaningful information for retrieval #### MultihopEmbeddingRetriever.retrieve ```python -def retrieve(query: str, - filters: Optional[Dict[str, Union[Dict, List, str, int, float, - bool]]] = None, - top_k: Optional[int] = None, - index: str = None, - headers: Optional[Dict[str, str]] = None, - scale_score: bool = None) -> List[Document] +def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] ``` Scan through documents in DocumentStore and return a small number documents @@ -1892,18 +1649,12 @@ Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. #### MultihopEmbeddingRetriever.retrieve\_batch ```python -def retrieve_batch(queries: List[str], - filters: Optional[Union[Dict[str, Union[Dict, List, str, - int, float, bool]], - List[Dict[str, - Union[Dict, List, str, - int, float, - bool]]], ]] = None, - top_k: Optional[int] = None, - index: str = None, - headers: Optional[Dict[str, str]] = None, - batch_size: Optional[int] = None, - scale_score: bool = None) -> List[List[Document]] +def retrieve_batch(queries: List[str], filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] ``` Scan through documents in DocumentStore and return a small number documents @@ -2010,10 +1761,7 @@ The generated SPARQL query is executed on a knowledge graph. #### Text2SparqlRetriever.\_\_init\_\_ ```python -def __init__(knowledge_graph, - model_name_or_path, - top_k: int = 1, - use_auth_token: Optional[Union[str, bool]] = None) +def __init__(knowledge_graph, model_name_or_path, top_k: int = 1, use_auth_token: Optional[Union[str, bool]] = None) ``` Init the Retriever by providing a knowledge graph and a pre-trained BART model diff --git a/haystack/nodes/retriever/_embedding_encoder.py b/haystack/nodes/retriever/_embedding_encoder.py index 366358555e..21636850e7 100644 --- a/haystack/nodes/retriever/_embedding_encoder.py +++ b/haystack/nodes/retriever/_embedding_encoder.py @@ -23,6 +23,12 @@ logger = logging.getLogger(__name__) +_TRAINING_LOSSES: Dict[str, Callable] = { + "mnrl": losses.MultipleNegativesRankingLoss, + "margin_mse": losses.MarginMSELoss, +} + + class _BaseEmbeddingEncoder: @abstractmethod def embed_queries(self, texts: List[str]) -> List[np.ndarray]: @@ -195,14 +201,30 @@ def train( n_epochs: int = 1, num_warmup_steps: int = None, batch_size: int = 16, + train_loss: str = "mnrl", ): - train_examples = [ - InputExample(texts=[i["question"], i["pos_doc"], i["neg_doc"]], label=i["score"]) for i in training_data - ] - logger.info(f"GPL training/adapting {self.embedding_model} with {len(train_examples)} examples") + if train_loss not in _TRAINING_LOSSES: + raise ValueError(f"Unrecognized train_loss {train_loss}. Should be one of: {_TRAINING_LOSSES.keys()}") + + train_examples = [] + for i in training_data: + texts = [i["question"], i["pos_doc"]] + # Negative docs are supported by all losses + if "neg_doc" in i: + texts.append(i["neg_doc"]) + if "score" not in i: + if train_loss == "margin_mse": + raise ValueError( + "Some training examples don't contain the 'score' field which is necessary when using 'margin_mse' loss." + ) + train_examples.append(InputExample(texts=texts)) + else: + train_examples.append(InputExample(texts=texts, label=i["score"])) + + logger.info(f"Training/adapting {self.embedding_model} with {len(train_examples)} examples") train_dataloader = DataLoader(train_examples, batch_size=batch_size, drop_last=True, shuffle=True) - train_loss = losses.MarginMSELoss(self.embedding_model) + train_loss = _TRAINING_LOSSES[train_loss](self.embedding_model) # Tune the model self.embedding_model.fit( diff --git a/haystack/nodes/retriever/dense.py b/haystack/nodes/retriever/dense.py index 3ae3829d42..303b7add5f 100644 --- a/haystack/nodes/retriever/dense.py +++ b/haystack/nodes/retriever/dense.py @@ -1863,6 +1863,7 @@ def train( n_epochs: int = 1, num_warmup_steps: int = None, batch_size: int = 16, + train_loss: str = "mnrl", ) -> None: """ Trains/adapts the underlying embedding model. @@ -1885,6 +1886,9 @@ def train( :type num_warmup_steps: int :param batch_size: The batch size to use for the training, defaults to 16 :type batch_size: int (optional) + :param train_loss: The loss to use for training. + If using sentence-transformers, one of 'mnrl' (Multiple Negatives Ranking Loss) or 'margin_mse' (MarginMSE) + :type train_loss: str (optional) """ self.embedding_encoder.train( training_data, @@ -1892,6 +1896,7 @@ def train( n_epochs=n_epochs, num_warmup_steps=num_warmup_steps, batch_size=batch_size, + train_loss=train_loss, ) def save(self, save_dir: Union[Path, str]) -> None: