Commit 056ed3d

kylesayrs and dsikka authored
[Performance] Batched calibration (#2054)
## Purpose ##
* Reduce calibration runtime by providing users with options to increase performance
  * `batch_size` controls the batch size of the calibration data
  * `offload_sequential_activations` controls whether intermediate calibration activations are offloaded to the CPU between layers

## Prerequisites ##
* #2080
* #2081

## Changes ##

### Batched Calibration ###
* Add a `batch_size` argument
* Change the `data_collator` default from the default data collator to a `"truncation"` collator
  * The `data_collator_with_truncation` function truncates all samples in a batch to the length of the shortest sample (see the illustrative sketch after this description)
  * Statistics on how many tokens are dropped by this method are in the tables below
  * The data collator can instead be set to `"padding"` to pad to the longest sample in the batch
* To reduce excess truncation/padding, default to the `LengthAwareSampler`, which samples the dataset so that samples of similar length are batched together

Batch Size | Time | % Speedup | % Deleted
-- | -- | -- | --
Original (1) | 11m17 | N/A | 0.0
1 | 11m17 | 0.0 | 0.0
2 | 10m48 | 4.2 | 0.2
4 | 10m39 | 5.6 | 0.5
8 | 10m39 | 5.6 | 1.1
16 | 10m58 | 2.8 | 2.6
64 | 11m4 | 11.2 | 12.0
128 | 9m29 | 16.0 | 23.9
512 | 7m39 | 37.3 | 75.3

* The speedup is relatively meager until you start deleting significant portions of the dataset via truncation

### Disable Offloading ###
* Add an `offload_sequential_activations` argument, which defaults to True (no behavior change)
* Disabling offloading increases throughput but also increases memory usage

Batch Size | Time | % Speedup | % Deleted
-- | -- | -- | --
Original (1) | 11m17 | N/A | 0.0
1 | 10m14 | 9.3 | 0.0
2 | 9m46 | 13.4 | 0.2
4 | 9m36 | 14.9 | 0.5
8 | 9m48 | 13.1 | 1.1
16 | 9m26 | 16.3 | 2.6
32 | 9m27 | 16.2 | 5.8
128 | 8m34 | 24.0 | 23.9
512 | 6m40 | 40.9 | 75.3

* The memory requirement for 512 samples on Llama 8B is ~70 GB, which is equivalent to that of batch size 128
* With offloading disabled and batch size 32, calibration runtime is less than 1s per layer (down from ~11s)
* This implies that the theoretical maximum speedup from reducing calibration time alone is ~15% for this model and dataset

### Misc ###
* Fix examples
  * Fixed examples with dtype mismatches between the model and processor outputs (Mistral3, Pixtral, Whisper)
  * For multimodal models which use multimodal datasets, remove their data collators, as the batch unwrapping is now done by the `TextGenerationDataset`
* Remove `_mask_padding` from `IntermediatesCache`, as I do not believe this method is effective at masking padding tokens from Hessian calculations
* Fix AWQ
  * AWQ was hard-coded to handle only batches of size 1

## Testing ##

### Evaluation Regression ###

Batch Size | Eval Score | Difference | % Deleted
-- | -- | -- | --
Original (1) | 0.6573 | 0.000 | 0.0
1 | 0.6513 | -0.6 | 0.0
2 | 0.6513 | -0.6 | 0.2
4 | 0.6657 | +0.8 | 0.5
8 | 0.6513 | -0.6 | 1.1
16 | 0.6672 | +1.0 | 2.6
64 | 0.6338 | -2.4 | 12.0
128 | 0.6603 | +0.3 | 23.9
512 | 0.6391 | -1.8 | 75.3

Deleting significant portions of the dataset (longer sequences are deleted first) has a detrimental effect on recovery.

### Modifiers ###
* GPTQ
  * Ran the full regression tests shown above
* AWQ
  * Ran AWQ with batch size 32 and checked output sanity
* Quantization Modifier
  * Ran NVFP4 with batch size 10 and checked output sanity

### Calibration Regression Testing ###
I ran calibration for the following models (but did not evaluate recovery).

The following model examples calibrate without issue:
* Llama3
* Gemma3
* Internvl3
* Mllama
* Llama4

The following models had a bug where processor and model dtypes were mismatched, which is now fixed by this PR:
* Mistral3
* Pixtral
* Whisper

The following models have an accelerate device offloading bug:
* Idefics3
* Phi3 Vision

The following model examples have an MoE replacement bug:
* qwen3-vl-30b-a3b-Instruct

## Future Work ##
While these options are a great place to start, the next step for improving runtime is to allow multi-GPU compression, likely via torch.distributed tensor parallelism.

---------

Signed-off-by: Kyle Sayers <[email protected]>
Co-authored-by: Dipika Sikka <[email protected]>
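To make the truncation and length-aware batching described above concrete, here is a minimal illustrative sketch. This is not the PR's actual `data_collator_with_truncation` or `LengthAwareSampler` implementation, and the function names are hypothetical; it only shows the two ideas at play: truncating each batch to its shortest sample, and grouping samples of similar length so that truncation (or padding) touches fewer tokens.

```python
import torch


def truncation_collator(features: list[dict]) -> dict[str, torch.Tensor]:
    # Truncate every sample in the batch to the length of the shortest sample,
    # then stack into batched tensors. Assumes every feature is a per-token
    # list (e.g. input_ids, attention_mask); hypothetical helper, not the
    # library's implementation.
    min_len = min(len(feature["input_ids"]) for feature in features)
    return {
        key: torch.tensor([feature[key][:min_len] for feature in features])
        for key in features[0]
    }


def length_aware_batches(lengths: list[int], batch_size: int) -> list[list[int]]:
    # Group sample indices so that similar-length samples share a batch,
    # which minimizes how many tokens truncation or padding affects.
    order = sorted(range(len(lengths)), key=lambda i: lengths[i])
    return [order[i : i + batch_size] for i in range(0, len(order), batch_size)]
```

Under this reading, the `% Deleted` columns above correspond to the tokens removed by the `[:min_len]` slice across the calibration set.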
1 parent 6cf8d29 commit 056ed3d

17 files changed: +259 additions, -203 deletions

examples/multimodal_audio/whisper_example.py

Lines changed: 17 additions & 6 deletions
```diff
@@ -1,6 +1,10 @@
 import torch
 from datasets import load_dataset
-from transformers import WhisperForConditionalGeneration, WhisperProcessor
+from transformers import (
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+    default_data_collator,
+)
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
@@ -55,20 +59,27 @@ def process(sample):
         return_tensors="pt",
     )
 
-    inputs["input_features"] = inputs["input_features"].to(dtype=model.dtype)
+    # treat labels as calibration prefill
     inputs["decoder_input_ids"] = inputs["labels"]
     del inputs["labels"]
 
+    # strip extra dim added by multimodal processors
+    inputs = {key: value[0] for key, value in inputs.items()}
+
     return inputs
 
 
 ds = ds.map(process, remove_columns=ds.column_names)
 
 
-# Define a oneshot data collator for multimodal inputs.
-def data_collator(batch):
-    assert len(batch) == 1
-    return {key: torch.tensor(value) for key, value in batch[0].items()}
+# Patch: mismatch between processor and model dtype
+def data_collator(features):
+    for feature in features:
+        feature["input_features"] = torch.tensor(
+            feature["input_features"], dtype=model.dtype
+        )
+
+    return default_data_collator(features, return_tensors="pt")
 
 
 # Recipe
```

examples/multimodal_vision/gemma3_example.py

Lines changed: 5 additions & 13 deletions
```diff
@@ -1,5 +1,4 @@
 import requests
-import torch
 from PIL import Image
 from transformers import AutoProcessor, Gemma3ForConditionalGeneration
 
@@ -13,17 +12,11 @@
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 
 # Oneshot arguments
-DATASET_ID = "flickr30k"
-DATASET_SPLIT = {"calibration": "test[:512]"}
+BATCH_SIZE = 4
 NUM_CALIBRATION_SAMPLES = 512
 MAX_SEQUENCE_LENGTH = 2048
-
-
-# Define a oneshot data collator for multimodal inputs.
-def data_collator(batch):
-    assert len(batch) == 1
-    return {key: torch.tensor(value) for key, value in batch[0].items()}
-
+DATASET_ID = "flickr30k"
+DATASET_SPLIT = {"calibration": f"test[:{NUM_CALIBRATION_SAMPLES}]"}
 
 # Recipe
 recipe = [
@@ -41,14 +34,13 @@ def data_collator(batch):
 # Perform oneshot
 oneshot(
     model=model,
-    tokenizer=model_id,
+    processor=processor,
     dataset=DATASET_ID,
     splits=DATASET_SPLIT,
     recipe=recipe,
+    batch_size=BATCH_SIZE,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
-    trust_remote_code_model=True,
-    data_collator=data_collator,
 )
 
 # Confirm generations of the quantized model look sane.
```

examples/multimodal_vision/internvl3_example.py

Lines changed: 4 additions & 11 deletions
```diff
@@ -37,20 +37,14 @@ def preprocess_and_tokenize(example):
         return_dict=True,
         return_tensors="pt",
     )
-    return inputs
-
 
-ds = ds.map(preprocess_and_tokenize)
+    # remove extra dim added by multimodal processors
+    inputs = {key: value[0] for key, value in inputs.items()}
 
+    return inputs
 
-def data_collator(batch):
-    assert len(batch) == 1
-    item = {key: value for key, value in batch[0].items()}
-    item["attention_mask"] = torch.tensor([item["attention_mask"]])
-    item["input_ids"] = torch.LongTensor([item["input_ids"]])
-
-    return item
 
+ds = ds.map(preprocess_and_tokenize, remove_columns=ds.column_names)
 
 # Recipe
 recipe = GPTQModifier(
@@ -68,7 +62,6 @@ def data_collator(batch):
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     trust_remote_code_model=True,
-    data_collator=data_collator,
 )
 
 # Save to disk compressed.
```

examples/multimodal_vision/llava_example.py

Lines changed: 0 additions & 8 deletions
```diff
@@ -1,5 +1,4 @@
 import requests
-import torch
 from PIL import Image
 from transformers import AutoProcessor, LlavaForConditionalGeneration
 
@@ -19,12 +18,6 @@
 MAX_SEQUENCE_LENGTH = 2048
 
 
-# Define a oneshot data collator for multimodal inputs.
-def data_collator(batch):
-    assert len(batch) == 1
-    return {key: torch.tensor(value) for key, value in batch[0].items()}
-
-
 # Recipe
 recipe = [
     GPTQModifier(
@@ -44,7 +37,6 @@ def data_collator(batch):
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     trust_remote_code_model=True,
-    data_collator=data_collator,
     sequential_targets=["LlamaDecoderLayer"],
 )
```

examples/multimodal_vision/mistral3_example.py

Lines changed: 11 additions & 11 deletions
```diff
@@ -4,7 +4,11 @@
 import requests
 import torch
 from PIL import Image
-from transformers import AutoProcessor, Mistral3ForConditionalGeneration
+from transformers import (
+    AutoProcessor,
+    Mistral3ForConditionalGeneration,
+    default_data_collator,
+)
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
@@ -27,17 +31,13 @@
 MAX_SEQUENCE_LENGTH = 2048
 
 
-# Define a oneshot data collator for multimodal inputs.
-def data_collator(batch):
-    assert len(batch) == 1
-    return {
-        key: (
-            torch.tensor(value)
-            if key != "pixel_values"
-            else torch.tensor(value, dtype=model.dtype)
+# Patch: mismatch between processor and model dtype
+def data_collator(features):
+    for feature in features:
+        feature["pixel_values"] = torch.tensor(
+            feature["pixel_values"], dtype=model.dtype
         )
-        for key, value in batch[0].items()
-    }
+    return default_data_collator(features, return_tensors="pt")
 
 
 # Recipe
```

examples/multimodal_vision/mllama_example.py

Lines changed: 0 additions & 8 deletions
```diff
@@ -1,5 +1,4 @@
 import requests
-import torch
 from PIL import Image
 from transformers import AutoProcessor, MllamaForConditionalGeneration
 
@@ -19,12 +18,6 @@
 MAX_SEQUENCE_LENGTH = 2048
 
 
-# Define a oneshot data collator for multimodal inputs.
-def data_collator(batch):
-    assert len(batch) == 1
-    return {key: torch.tensor(value) for key, value in batch[0].items()}
-
-
 # Recipe
 recipe = [
     GPTQModifier(
@@ -44,7 +37,6 @@ def data_collator(batch):
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     trust_remote_code_model=True,
-    data_collator=data_collator,
     sequential_targets=["MllamaSelfAttentionDecoderLayer"],
 )
```

examples/multimodal_vision/pixtral_example.py

Lines changed: 13 additions & 12 deletions
```diff
@@ -1,7 +1,11 @@
 import requests
 import torch
 from PIL import Image
-from transformers import AutoProcessor, LlavaForConditionalGeneration
+from transformers import (
+    AutoProcessor,
+    LlavaForConditionalGeneration,
+    default_data_collator,
+)
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
@@ -19,16 +23,13 @@
 MAX_SEQUENCE_LENGTH = 2048
 
 
-# Define a oneshot data collator for multimodal inputs.
-# NOTE: for transformers<4.48.0, please squeeze the first dimension of `pixel_values`
-# by appending `[0]` to the end of line 32
-def data_collator(batch):
-    assert len(batch) == 1
-    return {
-        "input_ids": torch.LongTensor(batch[0]["input_ids"]),
-        "attention_mask": torch.tensor(batch[0]["attention_mask"]),
-        "pixel_values": torch.tensor(batch[0]["pixel_values"]),
-    }
+# Patch: mismatch between processor and model dtype
+def data_collator(features):
+    for feature in features:
+        feature["pixel_values"] = torch.tensor(
+            feature["pixel_values"], dtype=model.dtype
+        )
+    return default_data_collator(features, return_tensors="pt")
 
 
 # Recipe
@@ -46,11 +47,11 @@ def data_collator(batch):
     tokenizer=model_id,
     dataset=DATASET_ID,
     splits={"calibration": f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]"},
+    data_collator=data_collator,
     recipe=recipe,
     max_seq_length=MAX_SEQUENCE_LENGTH,
     num_calibration_samples=NUM_CALIBRATION_SAMPLES,
     trust_remote_code_model=True,
-    data_collator=data_collator,
     sequential_targets=["MistralDecoderLayer"],
 )
```

src/llmcompressor/args/dataset_arguments.py

Lines changed: 33 additions & 9 deletions
```diff
@@ -8,9 +8,7 @@
 """
 
 from dataclasses import dataclass, field
-from typing import Any, Callable
-
-from transformers import DefaultDataCollator
+from typing import Callable
 
 
 @dataclass
@@ -69,9 +67,27 @@ class CustomDatasetArguments(DVCDatasetArguments):
         },
     )
 
-    data_collator: Callable[[Any], Any] = field(
-        default_factory=lambda: DefaultDataCollator(),
-        metadata={"help": "The function to used to form a batch from the dataset"},
+    batch_size: int = field(
+        default=1,
+        metadata={
+            "help": (
+                "Calibration batch size. During calibration, LLM Compressor disables "
+                "lm_head output computations to reduce memory usage from large "
+                "batch sizes. Large batch sizes may result in excess padding or "
+                "truncation, depending on the data_collator"
+            )
+        },
+    )
+
+    data_collator: str | Callable = field(
+        default="truncation",
+        metadata={
+            "help": (
+                "The function to used to form a batch from the dataset. Can also "
+                "specify 'truncation' or 'padding' to truncate or pad non-uniform "
+                "sequence lengths in a batch. Defaults to 'padding'."
+            )
+        },
     )
 
 
@@ -126,8 +142,8 @@ class DatasetArguments(CustomDatasetArguments):
         default=512,
         metadata={"help": "Number of samples to use for one-shot calibration"},
     )
-    shuffle_calibration_samples: bool | None = field(
-        default=True,
+    shuffle_calibration_samples: bool = field(
+        default=False,
         metadata={
            "help": "whether to shuffle the dataset before selecting calibration data"
         },
@@ -142,7 +158,7 @@ class DatasetArguments(CustomDatasetArguments):
     )
     preprocessing_num_workers: int | None = field(
         default=None,
-        metadata={"help": "The number of processes to use for the preprocessing."},
+        metadata={"help": "The number of workers to use for dataset processing."},
     )
     pad_to_max_length: bool = field(
         default=True,
@@ -214,6 +230,14 @@ class DatasetArguments(CustomDatasetArguments):
             "definition"
         },
     )
+    offload_sequential_activations: bool = field(
+        default=True,
+        metadata={
+            "help": "Whether to offload intermediate activations between sequential "
+            "layers to the CPU. Disabling offloading is much faster, but uses "
+            "signficiantly more memory. Default is True."
+        },
+    )
     quantization_aware_calibration: bool = field(
         default=True,
         metadata={
```
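For reference, here is a minimal sketch of how the new dataset arguments defined above might be passed through `oneshot` (assuming they are forwarded like the other dataset arguments, as `batch_size` is in the updated gemma3 example). The model, dataset, and recipe below are placeholders chosen for illustration, not part of this PR.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier

# Placeholder model and recipe; any of the example models works the same way.
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])

oneshot(
    model=model,
    tokenizer=tokenizer,
    dataset="open_platypus",
    recipe=recipe,
    max_seq_length=2048,
    num_calibration_samples=512,
    batch_size=32,                         # new: calibration batch size
    data_collator="truncation",            # new default; "padding" is also accepted
    offload_sequential_activations=False,  # new: faster, but uses more memory
)
```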
