Skip to content

Commit aa3d1cf

Browse files
naymaraq
authored and committed
Inference optimization for cache-aware pipelines (NVIDIA-NeMo#15035)
* optimize context manager and cache feature bufferer
  Signed-off-by: naymaraq <[email protected]>
* speedUp cache_feature_bufferer
  Signed-off-by: naymaraq <[email protected]>
* improved docstring in BatchedCacheFeatureBufferer
  Signed-off-by: naymaraq <[email protected]>
---------
Signed-off-by: naymaraq <[email protected]>
Co-authored-by: naymaraq <[email protected]>
Signed-off-by: genquan9 <[email protected]>
1 parent 7277489 commit aa3d1cf

File tree

6 files changed

+167
-201
lines changed

6 files changed

+167
-201
lines changed

nemo/collections/asr/inference/model_wrappers/asr_inference_wrapper.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,6 @@ def supports_capitalization(self) -> bool:
140140
Returns:
141141
(bool) True if the ASR model supports capitalization, False otherwise.
142142
"""
143-
if not hasattr(self, "asr_model") or self.asr_model is None:
144-
raise ValueError("ASR model is not initialized.")
145143
return self.tokenizer.supports_capitalization
146144

147145
def supports_punctuation(self) -> bool:
@@ -150,8 +148,6 @@ def supports_punctuation(self) -> bool:
150148
Returns:
151149
(bool) True if the ASR model supports punctuation, False otherwise.
152150
"""
153-
if not hasattr(self, "asr_model") or self.asr_model is None:
154-
raise ValueError("ASR model is not initialized.")
155151
return self.supported_punctuation() != set()
156152

157153
def supported_punctuation(self) -> set:

nemo/collections/asr/inference/pipelines/base_pipeline.py

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,10 @@ def from_state(cls, state: StreamingState, request: Request, sep: str = ' ') ->
7979
"""
8080
final_transcript = state.final_transcript.strip()
8181
final_segments = [seg.copy() for seg in state.final_segments]
82+
if len(final_segments) > 0:
83+
final_segments[0].text = final_segments[0].text.lstrip(sep)
84+
final_segments[-1].text = final_segments[-1].text.rstrip(sep)
85+
8286
if final_transcript:
8387
separator = ''
8488
if not request.is_first and state.concat_with_space:
@@ -185,11 +189,12 @@ def transcribe_step(self, requests: list[Request]) -> list[TranscribeStepOutput]
185189

186190
# Create current step output for each request
187191
outputs = []
192+
sep = self.get_sep()
188193
for request in requests:
189194

190195
# Extract current step output from the state
191196
state = self.get_state(request.stream_id)
192-
step_output = TranscribeStepOutput.from_state(state=state, request=request, sep=self.get_sep())
197+
step_output = TranscribeStepOutput.from_state(state=state, request=request, sep=sep)
193198
outputs.append(step_output)
194199

195200
# Cleanup the state after the response is sent
@@ -344,6 +349,7 @@ def init_bufferer_for_cache_aware_streaming(self) -> None:
344349
check_existance_of_required_attributes(
345350
self,
346351
[
352+
'num_slots',
347353
'use_feat_cache',
348354
'chunk_size_in_secs',
349355
'buffer_size_in_secs',
@@ -361,6 +367,7 @@ def init_bufferer_for_cache_aware_streaming(self) -> None:
361367
chunk_size_for_feature_buffer = self.buffer_size_in_secs
362368

363369
self.bufferer = BatchedCacheFeatureBufferer(
370+
num_slots=self.num_slots,
364371
sample_rate=self.sample_rate,
365372
buffer_size_in_secs=self.buffer_size_in_secs,
366373
chunk_size_in_secs=chunk_size_for_feature_buffer,
@@ -406,6 +413,7 @@ def run(
406413
request_generator.set_progress_bar(progress_bar)
407414

408415
pipeline_output = {}
416+
sep = self.get_sep()
409417
self.open_session()
410418
for requests in request_generator:
411419
step_outputs = self.transcribe_step(requests)
@@ -417,7 +425,18 @@ def run(
417425
"segments": [],
418426
"audio_filepath": request_generator.get_audio_filepath(stream_id),
419427
}
420-
pipeline_output[stream_id]["text"] += step_output.final_transcript
421-
pipeline_output[stream_id]["segments"].extend(step_output.final_segments)
428+
429+
accumulated_text = pipeline_output[stream_id]["text"]
430+
final_transcript = step_output.final_transcript
431+
final_segments = step_output.final_segments
432+
if not accumulated_text:
433+
final_transcript = final_transcript.lstrip(sep)
434+
if len(final_segments) > 0:
435+
first_segment = final_segments[0]
436+
first_segment.text = first_segment.text.lstrip(sep)
437+
438+
accumulated_text += final_transcript
439+
pipeline_output[stream_id]["text"] = accumulated_text
440+
pipeline_output[stream_id]["segments"].extend(final_segments)
422441
self.close_session()
423442
return pipeline_output

nemo/collections/asr/inference/pipelines/cache_aware_ctc_pipeline.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -163,11 +163,6 @@ def init_endpointer(self) -> None:
163163
residue_tokens_at_end=self.residue_tokens_at_end,
164164
)
165165

166-
def reset_session(self) -> None:
167-
"""Reset the context manager."""
168-
self.context_manager.reset()
169-
super().reset_session()
170-
171166
def create_state(self, options: ASRRequestOptions) -> CacheAwareCTCStreamingState:
172167
"""
173168
Create new empty state.

nemo/collections/asr/inference/pipelines/cache_aware_rnnt_pipeline.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -165,11 +165,6 @@ def init_endpointer(self) -> None:
165165
residue_tokens_at_end=self.residue_tokens_at_end,
166166
)
167167

168-
def reset_session(self) -> None:
169-
"""Reset the context manager."""
170-
self.context_manager.reset()
171-
super().reset_session()
172-
173168
def create_state(self, options: ASRRequestOptions) -> CacheAwareRNNTStreamingState:
174169
"""
175170
Create new empty state.

0 commit comments

Comments (0)