
Commit 34960c9

Jialin authored and xuebwang-amd committed
[Perf] Introduce FlattenLogprobs to store logprobs results to reduce GC overhead (vllm-project#28171)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
Signed-off-by: xuebwang-amd <xuebwang@amd.com>
1 parent f9694a0 commit 34960c9
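
The FlattenLogprobs container itself lives in vllm/logprobs.py, which is among the six changed files but omitted from this excerpt. Below is a rough, hypothetical sketch of the layout the new tests imply: field and method names are taken from tests/test_logprobs.py further down; everything else is an assumption, not the committed implementation.

# Hypothetical sketch of the flattened layout implied by the tests below.
# Field and method names mirror tests/test_logprobs.py; the actual
# vllm/logprobs.py implementation (not shown in this excerpt) may differ.
from dataclasses import dataclass, field


@dataclass
class Logprob:
    logprob: float
    rank: int
    decoded_token: str


# One position maps token id -> Logprob, as in the original representation.
LogprobsOnePosition = dict[int, Logprob]


@dataclass
class FlattenLogprobs:
    # Position i owns the half-open slice [start_indices[i], end_indices[i])
    # of the parallel arrays below, so the container holds a handful of flat
    # primitive lists instead of one dict of Logprob objects per position.
    start_indices: list[int] = field(default_factory=list)
    end_indices: list[int] = field(default_factory=list)
    token_ids: list[int] = field(default_factory=list)
    logprobs: list[float] = field(default_factory=list)
    ranks: list[int] = field(default_factory=list)
    decoded_tokens: list[str] = field(default_factory=list)

    def append(self, position_logprobs: LogprobsOnePosition) -> None:
        # Record where this position starts, flatten its entries, then
        # record where it ends.
        self.start_indices.append(len(self.token_ids))
        for token_id, lp in position_logprobs.items():
            self.token_ids.append(token_id)
            self.logprobs.append(lp.logprob)
            self.ranks.append(lp.rank)
            self.decoded_tokens.append(lp.decoded_token)
        self.end_indices.append(len(self.token_ids))

    def __len__(self) -> int:
        return len(self.start_indices)

    def __getitem__(self, position: int) -> LogprobsOnePosition:
        # Rebuild the per-position dict view on demand.
        start, end = self.start_indices[position], self.end_indices[position]
        return {
            self.token_ids[i]: Logprob(
                self.logprobs[i], self.ranks[i], self.decoded_tokens[i]
            )
            for i in range(start, end)
        }

The point of the flattening is visible in the field list: per-token data sits in a few long flat lists rather than one dict of Logprob objects per position, so the garbage collector has far fewer container objects to traverse.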

File tree: 6 files changed (+534, -125 lines)


tests/samplers/test_logprobs.py

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest

from vllm import SamplingParams
from vllm.logprobs import FlattenLogprobs

MODELS = ["distilbert/distilgpt2"]
MAX_TOKENS = 5
NUM_TOP_LOGPROBS = 5
NUM_PROMPT_LOGPROBS = 7
MAX_LOGPROBS = max(NUM_TOP_LOGPROBS, NUM_PROMPT_LOGPROBS)


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("greedy", [True, False])
@pytest.mark.parametrize("flatten_logprobs", [True, False])
def test_ranks(
    vllm_runner,
    model,
    dtype,
    greedy,
    flatten_logprobs,
    example_prompts,
    monkeypatch: pytest.MonkeyPatch,
):
    monkeypatch.setenv("VLLM_FLATTEN_LOGPROBS", "1" if flatten_logprobs else "0")
    with vllm_runner(model, dtype=dtype, max_logprobs=MAX_LOGPROBS) as vllm_model:
        tokenizer = vllm_model.llm.get_tokenizer()
        example_prompt_tokens = [tokenizer.encode(prompt) for prompt in example_prompts]
        sampling_params = SamplingParams(
            temperature=0.0 if greedy else 1.0,
            top_p=1.0,
            max_tokens=MAX_TOKENS,
            logprobs=NUM_TOP_LOGPROBS,
            prompt_logprobs=NUM_PROMPT_LOGPROBS,
        )
        results = vllm_model.generate_w_logprobs(example_prompts, sampling_params)

    assert len(results) == len(example_prompt_tokens)
    for i, (result, prompt_tokens) in enumerate(zip(results, example_prompt_tokens)):
        decode_tokens, _, decode_logprobs, prompt_logprobs = result

        # Ensure the return type of logprobs is accurate
        assert isinstance(
            prompt_logprobs, FlattenLogprobs if flatten_logprobs else list
        )
        assert isinstance(
            decode_logprobs, FlattenLogprobs if flatten_logprobs else list
        )

        ########################
        # Check prompt logprobs
        ########################
        assert len(prompt_tokens) == len(prompt_logprobs)
        # No logprob for the first prompt token
        assert not prompt_logprobs[0]
        for position, (token, logprobs) in enumerate(
            zip(prompt_tokens[1:], prompt_logprobs[1:]), start=1
        ):
            # Ensure the logprob of the prompt token is always returned
            logprob = logprobs.get(token)
            assert logprob is not None
            assert logprob.rank >= 1
            # The number of returned logprobs should be either
            # NUM_PROMPT_LOGPROBS or NUM_PROMPT_LOGPROBS + 1
            assert NUM_PROMPT_LOGPROBS <= len(logprobs) <= NUM_PROMPT_LOGPROBS + 1
            # Ensure the top NUM_PROMPT_LOGPROBS ranks are always extracted
            assert set(range(1, NUM_PROMPT_LOGPROBS + 1)).issubset(
                {logprob.rank for logprob in logprobs.values()}
            )

        ########################
        # Check sample logprobs
        ########################
        assert len(decode_tokens) == len(decode_logprobs)
        for position, (token, logprobs) in enumerate(
            zip(decode_tokens, decode_logprobs)
        ):
            # Ensure the logprob of the chosen token is always returned
            logprob = logprobs.get(token)
            assert logprob is not None
            if greedy:
                # For greedy sampling, the chosen token should always be rank 1
                assert logprob.rank == 1
            else:
                assert logprob.rank >= 1
            # The number of returned logprobs should be either
            # NUM_TOP_LOGPROBS or NUM_TOP_LOGPROBS + 1
            assert NUM_TOP_LOGPROBS <= len(logprobs) <= NUM_TOP_LOGPROBS + 1
            # Ensure the top NUM_TOP_LOGPROBS ranks are always extracted
            assert set(range(1, NUM_TOP_LOGPROBS + 1)).issubset(
                {logprob.rank for logprob in logprobs.values()}
            )
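
A side note on the k-or-k+1 bound asserted above: each position always records the top-k candidates, and the prompt or sampled token adds an extra entry only when its rank falls outside that top-k. A tiny illustration of the counting, with k standing in for NUM_TOP_LOGPROBS / NUM_PROMPT_LOGPROBS:

# Illustration only: the chosen token is one extra entry iff its rank > k.
k = 5
for chosen_rank, expected_entries in [(1, 5), (3, 5), (11, 6)]:
    assert k + (0 if chosen_rank <= k else 1) == expected_entries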

tests/samplers/test_ranks.py

Lines changed: 0 additions & 59 deletions
This file was deleted.

tests/test_logprobs.py

Lines changed: 222 additions & 0 deletions
@@ -0,0 +1,222 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project


import pytest

from vllm.logprobs import (
    FlattenLogprobs,
    Logprob,
    LogprobsOnePosition,
    append_logprobs_for_next_position,
    create_prompt_logprobs,
    create_sample_logprobs,
)


def test_create_logprobs_non_flatten(monkeypatch: pytest.MonkeyPatch) -> None:
    monkeypatch.setenv("VLLM_FLATTEN_LOGPROBS", "0")

    prompt_logprobs = create_prompt_logprobs()
    assert isinstance(prompt_logprobs, list)
    # Ensure first prompt position logprobs is None
    assert len(prompt_logprobs) == 1
    assert prompt_logprobs[0] is None

    sample_logprobs = create_sample_logprobs()
    assert isinstance(sample_logprobs, list)
    assert len(sample_logprobs) == 0


def test_create_logprobs_flatten(monkeypatch: pytest.MonkeyPatch) -> None:
    monkeypatch.setenv("VLLM_FLATTEN_LOGPROBS", "1")

    prompt_logprobs = create_prompt_logprobs()
    assert isinstance(prompt_logprobs, FlattenLogprobs)
    assert prompt_logprobs.start_indices == [0]
    assert prompt_logprobs.end_indices == [0]
    assert len(prompt_logprobs.token_ids) == 0
    assert len(prompt_logprobs.logprobs) == 0
    assert len(prompt_logprobs.ranks) == 0
    assert len(prompt_logprobs.decoded_tokens) == 0
    # Ensure first prompt position logprobs is empty
    assert len(prompt_logprobs) == 1
    assert prompt_logprobs[0] == dict()

    sample_logprobs = create_sample_logprobs()
    assert isinstance(sample_logprobs, FlattenLogprobs)
    assert len(sample_logprobs.start_indices) == 0
    assert len(sample_logprobs.end_indices) == 0
    assert len(sample_logprobs.token_ids) == 0
    assert len(sample_logprobs.logprobs) == 0
    assert len(sample_logprobs.ranks) == 0
    assert len(sample_logprobs.decoded_tokens) == 0
    assert len(sample_logprobs) == 0


def test_append_logprobs_for_next_position_none_flatten(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    monkeypatch.setenv("VLLM_FLATTEN_LOGPROBS", "0")
    logprobs = create_sample_logprobs()
    append_logprobs_for_next_position(
        logprobs,
        token_ids=[1],
        logprobs=[0.1],
        decoded_tokens=["1"],
        rank=10,
        num_logprobs=-1,
    )
    append_logprobs_for_next_position(
        logprobs,
        token_ids=[2, 3],
        logprobs=[0.2, 0.3],
        decoded_tokens=["2", "3"],
        rank=11,
        num_logprobs=-1,
    )
    assert isinstance(logprobs, list)
    assert logprobs == [
        {1: Logprob(logprob=0.1, rank=10, decoded_token="1")},
        {
            2: Logprob(logprob=0.2, rank=11, decoded_token="2"),
            3: Logprob(logprob=0.3, rank=1, decoded_token="3"),
        },
    ]


def test_append_logprobs_for_next_position_flatten(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    monkeypatch.setenv("VLLM_FLATTEN_LOGPROBS", "1")
    logprobs = create_sample_logprobs()
    append_logprobs_for_next_position(
        logprobs,
        token_ids=[1],
        logprobs=[0.1],
        decoded_tokens=["1"],
        rank=10,
        num_logprobs=-1,
    )
    append_logprobs_for_next_position(
        logprobs,
        token_ids=[2, 3],
        logprobs=[0.2, 0.3],
        decoded_tokens=["2", "3"],
        rank=11,
        num_logprobs=-1,
    )
    assert isinstance(logprobs, FlattenLogprobs)
    assert logprobs.start_indices == [0, 1]
    assert logprobs.end_indices == [1, 3]
    assert logprobs.token_ids == [1, 2, 3]
    assert logprobs.logprobs == [0.1, 0.2, 0.3]
    assert logprobs.ranks == [10, 11, 1]
    assert logprobs.decoded_tokens == ["1", "2", "3"]


LOGPROBS_ONE_POSITION_0: LogprobsOnePosition = {
    1: Logprob(logprob=0.1, rank=10, decoded_token="10")
}
LOGPROBS_ONE_POSITION_1: LogprobsOnePosition = {
    2: Logprob(logprob=0.2, rank=20, decoded_token="20"),
    3: Logprob(logprob=0.3, rank=30, decoded_token="30"),
}
LOGPROBS_ONE_POSITION_2: LogprobsOnePosition = {
    4: Logprob(logprob=0.4, rank=40, decoded_token="40"),
    5: Logprob(logprob=0.5, rank=50, decoded_token="50"),
    6: Logprob(logprob=0.6, rank=60, decoded_token="60"),
}


def test_flatten_logprobs_append() -> None:
    logprobs = FlattenLogprobs()
    logprobs.append(LOGPROBS_ONE_POSITION_0)
    logprobs.append(LOGPROBS_ONE_POSITION_1)
    assert logprobs.start_indices == [0, 1]
    assert logprobs.end_indices == [1, 3]
    assert logprobs.token_ids == [1, 2, 3]
    assert logprobs.logprobs == [0.1, 0.2, 0.3]
    assert logprobs.ranks == [10, 20, 30]
    assert logprobs.decoded_tokens == ["10", "20", "30"]

    logprobs.append(LOGPROBS_ONE_POSITION_2)
    assert logprobs.start_indices == [0, 1, 3]
    assert logprobs.end_indices == [1, 3, 6]
    assert logprobs.token_ids == [1, 2, 3, 4, 5, 6]
    assert logprobs.logprobs == [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
    assert logprobs.ranks == [10, 20, 30, 40, 50, 60]
    assert logprobs.decoded_tokens == ["10", "20", "30", "40", "50", "60"]


def test_flatten_logprobs_extend() -> None:
    logprobs = FlattenLogprobs()
    # Extend with list[LogprobsOnePosition]
    logprobs.extend([LOGPROBS_ONE_POSITION_2, LOGPROBS_ONE_POSITION_0])
    assert logprobs.start_indices == [0, 3]
    assert logprobs.end_indices == [3, 4]
    assert logprobs.token_ids == [4, 5, 6, 1]
    assert logprobs.logprobs == [0.4, 0.5, 0.6, 0.1]
    assert logprobs.ranks == [40, 50, 60, 10]
    assert logprobs.decoded_tokens == ["40", "50", "60", "10"]

    other_logprobs = FlattenLogprobs()
    other_logprobs.extend([LOGPROBS_ONE_POSITION_1, LOGPROBS_ONE_POSITION_0])
    # Extend with another FlattenLogprobs
    logprobs.extend(other_logprobs)
    assert logprobs.start_indices == [0, 3, 4, 6]
    assert logprobs.end_indices == [3, 4, 6, 7]
    assert logprobs.token_ids == [4, 5, 6, 1, 2, 3, 1]
    assert logprobs.logprobs == [0.4, 0.5, 0.6, 0.1, 0.2, 0.3, 0.1]
    assert logprobs.ranks == [40, 50, 60, 10, 20, 30, 10]
    assert logprobs.decoded_tokens == ["40", "50", "60", "10", "20", "30", "10"]


def test_flatten_logprobs_access() -> None:
    logprobs = FlattenLogprobs()
    logprobs.extend(
        [LOGPROBS_ONE_POSITION_1, LOGPROBS_ONE_POSITION_2, LOGPROBS_ONE_POSITION_0]
    )
    assert logprobs.start_indices == [0, 2, 5]
    assert logprobs.end_indices == [2, 5, 6]
    assert logprobs.token_ids == [2, 3, 4, 5, 6, 1]
    assert logprobs.logprobs == [0.2, 0.3, 0.4, 0.5, 0.6, 0.1]
    assert logprobs.ranks == [20, 30, 40, 50, 60, 10]
    assert logprobs.decoded_tokens == ["20", "30", "40", "50", "60", "10"]

    # Test __len__
    assert len(logprobs) == 3

    # Test __iter__
    for actual_logprobs, expected_logprobs in zip(
        logprobs,
        [LOGPROBS_ONE_POSITION_1, LOGPROBS_ONE_POSITION_2, LOGPROBS_ONE_POSITION_0],
    ):
        assert actual_logprobs == expected_logprobs

    # Test __getitem__: single item
    assert logprobs[0] == LOGPROBS_ONE_POSITION_1
    assert logprobs[1] == LOGPROBS_ONE_POSITION_2
    assert logprobs[2] == LOGPROBS_ONE_POSITION_0

    # Test __getitem__: slice
    logprobs02 = logprobs[:2]
    assert len(logprobs02) == 2
    assert logprobs02[0] == LOGPROBS_ONE_POSITION_1
    assert logprobs02[1] == LOGPROBS_ONE_POSITION_2
    assert logprobs02.start_indices == [0, 2]
    assert logprobs02.end_indices == [2, 5]
    assert logprobs02.token_ids == [2, 3, 4, 5, 6]
    assert logprobs02.logprobs == [0.2, 0.3, 0.4, 0.5, 0.6]
    assert logprobs02.ranks == [20, 30, 40, 50, 60]
    assert logprobs02.decoded_tokens == ["20", "30", "40", "50", "60"]
    logprobs_last2 = logprobs[-2:]
    assert len(logprobs_last2) == 2
    assert logprobs_last2[0] == LOGPROBS_ONE_POSITION_2
    assert logprobs_last2[1] == LOGPROBS_ONE_POSITION_0
    assert logprobs_last2.start_indices == [0, 3]
    assert logprobs_last2.end_indices == [3, 4]
    assert logprobs_last2.token_ids == [4, 5, 6, 1]
    assert logprobs_last2.logprobs == [0.4, 0.5, 0.6, 0.1]
    assert logprobs_last2.ranks == [40, 50, 60, 10]
    assert logprobs_last2.decoded_tokens == ["40", "50", "60", "10"]
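
The slice assertions above pin down one more behavior: slicing returns a new FlattenLogprobs whose indices are rebased to zero. Below is a possible implementation consistent with those expectations, reusing the hypothetical sketch at the top of this page; the committed code may differ.

# Hypothetical slice helper matching the expectations in
# test_flatten_logprobs_access; not the committed implementation.
def flatten_logprobs_slice(src: FlattenLogprobs, key: slice) -> FlattenLogprobs:
    # Assumes a contiguous slice (step 1), which is all the tests exercise.
    positions = range(len(src))[key]  # normalizes negative/open bounds
    out = FlattenLogprobs()
    if len(positions) == 0:
        return out
    lo = src.start_indices[positions[0]]  # first flat index that survives
    hi = src.end_indices[positions[-1]]   # one past the last surviving index
    for pos in positions:
        # Rebase per-position offsets so the new container starts at 0.
        out.start_indices.append(src.start_indices[pos] - lo)
        out.end_indices.append(src.end_indices[pos] - lo)
    out.token_ids = src.token_ids[lo:hi]
    out.logprobs = src.logprobs[lo:hi]
    out.ranks = src.ranks[lo:hi]
    out.decoded_tokens = src.decoded_tokens[lo:hi]
    return out

With the data from test_flatten_logprobs_access, flatten_logprobs_slice(logprobs, slice(-2, None)) reproduces the start_indices == [0, 3] and end_indices == [3, 4] asserted above.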

vllm/envs.py

Lines changed: 6 additions & 0 deletions
@@ -220,6 +220,7 @@
     VLLM_GC_DEBUG: str = ""
     VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False
     VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"
+    VLLM_FLATTEN_LOGPROBS: bool = False


 def get_default_cache_root():
@@ -1463,6 +1464,11 @@ def get_vllm_port() -> int | None:
     "VLLM_COMPILE_CACHE_SAVE_FORMAT": env_with_choices(
         "VLLM_COMPILE_CACHE_SAVE_FORMAT", "binary", ["binary", "unpacked"]
     ),
+    # Flag to enable FlattenLogprobs, whose GC overhead is significantly
+    # smaller than that of the original list[dict[int, Logprob]] approach.
+    # When enabled, PromptLogprobs and SampleLogprobs are populated as
+    # FlattenLogprobs.
+    "VLLM_FLATTEN_LOGPROBS": lambda: bool(int(os.getenv("VLLM_FLATTEN_LOGPROBS", "0"))),
 }

 # --8<-- [end:env-vars-definition]
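
For anyone trying the flag out, here is a minimal sketch of enabling it offline. The model name and sampling parameters are borrowed from the tests above; the exact engine arguments are an assumption, not part of this diff.

# Minimal sketch: enable VLLM_FLATTEN_LOGPROBS for an offline run.
import os

# The env var is parsed with bool(int(...)), so use "0" or "1". Set it
# before the engine is created so the flag is seen when the logprobs
# containers are built.
os.environ["VLLM_FLATTEN_LOGPROBS"] = "1"

from vllm import LLM, SamplingParams

llm = LLM(model="distilbert/distilgpt2", max_logprobs=7)  # assumed engine args
outputs = llm.generate(
    ["Hello, world"],
    SamplingParams(max_tokens=5, logprobs=5, prompt_logprobs=7),
)
# With the flag on, prompt and sample logprobs arrive as FlattenLogprobs
# rather than list[dict[int, Logprob]]; both support len(), iteration,
# indexing, and slicing, so most callers should not notice the change.
print(type(outputs[0].prompt_logprobs))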
