
Commit 296b870

Update commentary around model document encoding

1 parent: fc0bdea

6 files changed: +62 -85 lines

embeddings/cli.py

Lines changed: 2 additions & 2 deletions
@@ -230,10 +230,10 @@ def create_embeddings(
     )
 
     # create an iterator of EmbeddingInputs applying all requested strategies
-    input_records = create_embedding_inputs(timdex_records, list(strategy))
+    embedding_inputs = create_embedding_inputs(timdex_records, list(strategy))
 
     # create embeddings via the embedding model
-    embeddings = model.create_embeddings(input_records)
+    embeddings = model.create_embeddings(embedding_inputs)
 
     # if requested, write embeddings to a local JSONLines file
     if output_jsonl:

embeddings/models/base.py

Lines changed: 7 additions & 7 deletions
@@ -51,20 +51,20 @@ def load(self) -> None:
         """Load model from self.model_path."""
 
     @abstractmethod
-    def create_embedding(self, input_record: EmbeddingInput) -> Embedding:
+    def create_embedding(self, embedding_input: EmbeddingInput) -> Embedding:
         """Create an Embedding for an EmbeddingInput.
 
         Args:
-            input_record: EmbeddingInput instance
+            embedding_input: EmbeddingInput instance
         """
 
     def create_embeddings(
-        self, input_records: Iterator[EmbeddingInput]
+        self, embedding_inputs: Iterator[EmbeddingInput]
     ) -> Iterator[Embedding]:
-        """Yield Embeddings for an iterator of InputRecords.
+        """Yield Embeddings for a batch of EmbeddingInputs.
 
         Args:
-            input_records: iterator of InputRecords
+            embedding_inputs: iterator of EmbeddingInputs
         """
-        for input_text in input_records:
-            yield self.create_embedding(input_text)
+        for embedding_input in embedding_inputs:
+            yield self.create_embedding(embedding_input)
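
The base create_embeddings() shown above is a generator, so embeddings are produced lazily, one at a time, as the caller iterates. A minimal usage sketch (the model and EmbeddingInput values are hypothetical, not part of this commit):

    inputs = iter([embedding_input_one, embedding_input_two])  # hypothetical EmbeddingInputs
    embeddings = model.create_embeddings(inputs)
    first = next(embeddings)  # only the first input has been passed to create_embedding()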

embeddings/models/os_neural_sparse_doc_v3_gte.py

Lines changed: 40 additions & 63 deletions
@@ -173,32 +173,24 @@ def load(self) -> None:
             f"{time.perf_counter() - start_time:.2f}s"
         )
 
-    def create_embedding(self, input_record: EmbeddingInput) -> Embedding:
-        """Create sparse embeddings for the input text (document encoding).
-
-        This method generates sparse document embeddings.
-
-        Process follows the model card exactly:
-        1. Tokenize the document
-        2. Pass through the masked language model to get logits
-        3. Convert logits to sparse vector
-        6. Return both raw sparse vector and decoded token-weight pairs
+    def create_embedding(self, embedding_input: EmbeddingInput) -> Embedding:
+        """Create sparse vector and decoded token weight embeddings for an input text.
 
         Args:
-            input_record: The input containing text to embed
+            embedding_input: EmbeddingInput object with a .text attribute
        """
        # generate the sparse embeddings
-        sparse_vector, decoded_tokens = self._encode_documents([input_record.text])[0]
+        sparse_vector, decoded_tokens = self._encode_documents([embedding_input.text])[0]
 
         # coerce sparse vector tensor into list[float]
         sparse_vector_list = sparse_vector.cpu().numpy().tolist()
 
         return Embedding(
-            timdex_record_id=input_record.timdex_record_id,
-            run_id=input_record.run_id,
-            run_record_offset=input_record.run_record_offset,
+            timdex_record_id=embedding_input.timdex_record_id,
+            run_id=embedding_input.run_id,
+            run_record_offset=embedding_input.run_record_offset,
             model_uri=self.model_uri,
-            embedding_strategy=input_record.embedding_strategy,
+            embedding_strategy=embedding_input.embedding_strategy,
             embedding_vector=sparse_vector_list,
             embedding_token_weights=decoded_tokens,
         )
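
A usage sketch of the method above; the field values are invented for illustration, and model is assumed to be a loaded OSNeuralSparseDocV3GTE instance:

    embedding_input = EmbeddingInput(
        timdex_record_id="alma:123",  # invented example values
        run_id="run-1",
        run_record_offset=0,
        embedding_strategy="full_record",
        text="Geology of the Boston Basin",
    )
    embedding = model.create_embedding(embedding_input)
    embedding.embedding_vector         # sparse vector, one float per vocabulary token
    embedding.embedding_token_weights  # e.g. {"geology": 2.1, "boston": 1.8, ...}
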
@@ -212,53 +204,32 @@ def _encode_documents(
         This follows the pattern outlined on the HuggingFace model card for document
         encoding.
 
-        This method will accommodate a list of text inputs, and return a list of
-        embeddings, but the calling base method create_embeddings() is a singular input +
+        This method will accommodate MULTIPLE text inputs, and return a list of
+        embeddings, but the calling context of create_embedding() is a SINGULAR input +
         output. This method keeps the ability to handle multiple inputs + outputs, in the
-        event we want something like a create_multiple_embeddings() method in the future.
-
-        The following is a rough approximation of receiving logits back from the model
-        and converting this to a sparse vector which can then be decoded to token:weights:
+        event we want something like a create_multiple_embeddings() method in the future,
+        but only returns a single result.
 
-        ----------------------------------------------------------------------------------
-        Imagine your vocabulary is just 5 words: ["cat", "dog", "bird", "fish", "tree"]
-        Vocabulary indices:                      [  0,     1,      2,      3,     4  ]
+        At a very high level, the following is performed:
 
-        1. MODEL RETURNS LOGITS
-        Let's say you input the text: "cat and dog"
-        After tokenization, you have 3 tokens at 3 sequence positions
-        The model outputs logits - a score for EVERY vocab word at EVERY position:
+        1. We tokenize the input text into "features" using the model's tokenizer.
 
-        logits = [
-            # Position 0 (word "cat"): scores for each vocab word at this position
-            [9.2, 1.1, 0.3, 0.5, 0.2],  # "cat" gets high score (9.2)
-
-            # Position 1 (word "and" - not in our toy vocab, but tokenized somehow):
-            [2.1, 1.8, 0.4, 0.3, 0.9],  # moderate scores everywhere
-
-            # Position 2 (word "dog"):
-            [0.8, 8.7, 0.2, 0.4, 0.1],  # "dog" gets high score (8.7)
-        ]
-        Shape: (3 positions, 5 vocab words)
+        2. The features are fed to the model returning model output logits. These logits
+        are "dense" in the sense there are few zeros, but they are not "dense vectors"
+        (embeddings) in the sense that they meaningfully represent the input document in
+        geometric space; two logit tensors cannot be compared with something like cosine
+        similarity.
 
+        3. The logits are then converted into a sparse vector, which is a numeric
+        array of floats with the same number of values as the model's vocabulary. Each
+        value's position in the sparse array corresponds to the token id in the
+        vocabulary, and the value itself is the "weight" of this token in the input text.
 
-        2. PRODUCE SPARSE VECTORS FROM LOGITS
-        We collapse the sequence positions by taking the MAX score for each vocab word:
-
-        sparse_vector = [
-            max(9.2, 2.1, 0.8),  # "cat": take max across all 3 positions = 9.2
-            max(1.1, 1.8, 8.7),  # "dog": take max = 8.7
-            max(0.3, 0.4, 0.2),  # "bird": take max = 0.4
-            max(0.5, 0.3, 0.4),  # "fish": take max = 0.5
-            max(0.2, 0.9, 0.1),  # "tree": take max = 0.9
-        ]
-
-        Apply transformations (ReLU, double-log) to make it sparser:
-        sparse_vector = [5.1, 4.8, 0.0, 0.0, 0.0]  # smaller values become 0
-
-        Final result:
-        {"cat": 5.1, "dog": 4.8}  # Only the relevant words have non-zero weights
-        ----------------------------------------------------------------------------------
+        4. Lastly, we convert this sparse vector into a {token:weight} dictionary of the
+        actual token strings and their numerical weight. This dictionary may contain
+        tokens not present in the original text, but will be considerably shorter than
+        the model vocabulary length given all zero and low scoring tokens are dropped.
+        This is the final form that we will ultimately index into OpenSearch.
 
         Args:
             texts: list of strings to create embeddings for
@@ -278,14 +249,14 @@ def _encode_documents(
         # move to CPU or GPU device, depending on what's available
         features = {k: v.to(self._device) for k, v in features.items()}
 
-        # get model logits output
+        # pass features to the model and receive model output logits as a tensor
         with torch.no_grad():
             output = self._model(**features)[0]
 
-        # generate sparse vectors from model logits
+        # generate sparse vectors from model logits tensor
         sparse_vectors = self._get_sparse_vectors(features, output)
 
-        # decode to token-weight dictionaries
+        # decode sparse vectors to token-weight dictionaries
        decoded = self._decode_sparse_vectors(sparse_vectors)
 
         # return list of tuple(vector, decoded token weights) embedding results
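
A rough sketch of steps 1-4 described in the docstring above, following the HuggingFace model-card pattern for document encoding. The names encode_document, tokenizer, model, and special_token_ids are illustrative assumptions (the repo's _encode_documents() is batched and attached to the class); this collapses it to a single document:

    import torch

    def encode_document(tokenizer, model, special_token_ids, text):
        # 1. tokenize the input text into features
        features = tokenizer([text], padding=True, truncation=True, return_tensors="pt")

        # 2. forward pass; logits have shape (1, seq_len, vocab_size)
        with torch.no_grad():
            logits = model(**features)[0]

        # 3. collapse sequence positions with max pooling, sparsify with
        #    ReLU + double log, and zero out special tokens like [CLS]/[SEP]/[PAD]
        values, _ = torch.max(logits * features["attention_mask"].unsqueeze(-1), dim=1)
        values = torch.log(1 + torch.log(1 + torch.relu(values)))
        values[:, special_token_ids] = 0
        sparse_vector = values[0]  # length == model vocabulary size

        # 4. decode non-zero positions into a {token: weight} dictionary
        token_ids = torch.nonzero(sparse_vector).squeeze(-1).tolist()
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
        return {t: round(sparse_vector[i].item(), 4) for t, i in zip(tokens, token_ids)}
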
@@ -304,20 +275,26 @@ def _get_sparse_vectors(
         2. log(1 + log(1 + relu())) transformation
         3. Zero out special tokens
 
+        The end result is a sparse vector with a length of the model vocabulary, with each
+        position representing a token in the model vocabulary and each value representing
+        that token's weight relative to the input text.
+
         Args:
             features: Tokenizer output with attention_mask
             output: Model logits of shape (batch_size, seq_len, vocab_size)
 
         Returns:
             Sparse vectors of shape (batch_size, vocab_size)
         """
-        # max pooling with attention mask
+        # collapse sequence positions: take max logit for each vocab token across all
+        # positions (also masks out padding tokens)
         values, _ = torch.max(output * features["attention_mask"].unsqueeze(-1), dim=1)
 
-        # apply the v3 model activation
+        # compress values to create sparsity: ReLU removes negatives,
+        # double-log shrinks large values
         values = torch.log(1 + torch.log(1 + torch.relu(values)))
 
-        # zero out special tokens
+        # remove special tokens like [CLS], [SEP], [PAD]
         values[:, self._special_token_ids] = 0
 
         return values
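
To make the ReLU + double-log step concrete, a toy example with invented pooled logits (not the repo's data):

    import torch

    pooled = torch.tensor([9.2, 8.7, 0.4, -1.3, 0.9])
    weights = torch.log(1 + torch.log(1 + torch.relu(pooled)))
    # ~ tensor([1.20, 1.19, 0.29, 0.00, 0.50]): negatives vanish, large logits are
    # compressed, and low scores stay near zero until decoding drops them entirely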

tests/conftest.py

Lines changed: 5 additions & 5 deletions
@@ -45,12 +45,12 @@ def download(self) -> Path:
     def load(self) -> None:
         logger.info("Model loaded successfully, 1.5s")
 
-    def create_embedding(self, input_record: EmbeddingInput) -> Embedding:
+    def create_embedding(self, embedding_input: EmbeddingInput) -> Embedding:
         return Embedding(
-            timdex_record_id=input_record.timdex_record_id,
-            run_id=input_record.run_id,
-            run_record_offset=input_record.run_record_offset,
-            embedding_strategy=input_record.embedding_strategy,
+            timdex_record_id=embedding_input.timdex_record_id,
+            run_id=embedding_input.run_id,
+            run_record_offset=embedding_input.run_record_offset,
+            embedding_strategy=embedding_input.embedding_strategy,
             model_uri=self.model_uri,
             embedding_vector=[0.1, 0.2, 0.3],
             embedding_token_weights={"coffee": 0.9, "seattle": 0.5},

tests/test_models.py

Lines changed: 4 additions & 4 deletions
@@ -35,14 +35,14 @@ def test_mock_model_load(caplog, mock_model):
 
 
 def test_mock_model_create_embedding(mock_model):
-    input_record = EmbeddingInput(
+    embedding_input = EmbeddingInput(
         timdex_record_id="test-id",
         run_id="test-run",
         run_record_offset=42,
         embedding_strategy="full_record",
         text="test text",
     )
-    embedding = mock_model.create_embedding(input_record)
+    embedding = mock_model.create_embedding(embedding_input)
 
     assert embedding.timdex_record_id == "test-id"
     assert embedding.run_id == "test-run"
@@ -87,7 +87,7 @@ class InvalidModel(BaseEmbeddingModel):
 
 
 def test_base_model_create_embeddings_calls_create_embedding(mock_model):
-    input_records = [
+    embedding_inputs = [
         EmbeddingInput(
             timdex_record_id="id-1",
             run_id="run-1",
@@ -105,7 +105,7 @@ def test_base_model_create_embeddings_calls_create_embedding(mock_model):
     ]
 
     # create_embeddings should iterate and call create_embedding
-    embeddings = list(mock_model.create_embeddings(iter(input_records)))
+    embeddings = list(mock_model.create_embeddings(iter(embedding_inputs)))
 
     assert len(embeddings) == 2  # two input records
     assert embeddings[0].timdex_record_id == "id-1"

tests/test_os_neural_sparse_doc_v3_gte.py

Lines changed: 4 additions & 4 deletions
@@ -293,7 +293,7 @@ def test_load_sets_up_special_token_ids(
 def test_create_embedding_raises_error_if_model_not_loaded(tmp_path):
     """Test create_embedding raises RuntimeError if model not loaded."""
     model = OSNeuralSparseDocV3GTE(tmp_path / "model")
-    input_record = EmbeddingInput(
+    embedding_input = EmbeddingInput(
         timdex_record_id="test:123",
         run_id="run-456",
         run_record_offset=0,
@@ -302,7 +302,7 @@ def test_create_embedding_raises_error_if_model_not_loaded(tmp_path):
     )
 
     with pytest.raises(RuntimeError, match="Model not loaded"):
-        model.create_embedding(input_record)
+        model.create_embedding(embedding_input)
 
 
 def test_create_embedding_returns_embedding_object(tmp_path, monkeypatch):
@@ -317,15 +317,15 @@ def mock_encode_documents(texts):
 
     monkeypatch.setattr(model, "_encode_documents", mock_encode_documents)
 
-    input_record = EmbeddingInput(
+    embedding_input = EmbeddingInput(
         timdex_record_id="test:123",
         run_id="run-456",
         run_record_offset=42,
         embedding_strategy="title_only",
         text="test document",
     )
 
-    embedding = model.create_embedding(input_record)
+    embedding = model.create_embedding(embedding_input)
 
     assert embedding.timdex_record_id == "test:123"
     assert embedding.run_id == "run-456"
