
Commit e07056a

Authored by yuanwu
[Gaudi] Remove optimum-habana (#3261)
Signed-off-by: yuanwu <[email protected]>
1 parent 25fdc5f commit e07056a

20 files changed: +23 −5995 lines changed

Dockerfile_gaudi

Lines changed: 1 addition & 1 deletion
@@ -57,7 +57,7 @@ ARG PYTORCH_VERSION
 
 FROM vault.habana.ai/gaudi-docker/${HABANA_VERSION}/ubuntu22.04/habanalabs/pytorch-installer-${PYTORCH_VERSION}:latest AS base
 
-ENV ATTENTION=default
+ENV ATTENTION=paged
 ENV PREFIX_CACHING=0
 ENV PREFILL_CHUNKING=0
 ENV PT_HPU_LAZY_MODE=1
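
Note: the image default for the attention backend changes from "default" to "paged", in line with the removal of the non-paged (optimum-habana) code paths below. A minimal sketch of how such container defaults can be read on the Python side, assuming an env-driven config module along the lines of the globals.py this backend imports elsewhere (not the repo's actual code):

# Minimal sketch, not repo code: reading the defaults baked into Dockerfile_gaudi.
import os

ATTENTION = os.environ.get("ATTENTION", "paged")  # image default is now "paged"
PREFIX_CACHING = os.environ.get("PREFIX_CACHING", "0") == "1"
PREFILL_CHUNKING = os.environ.get("PREFILL_CHUNKING", "0") == "1"

print(f"attention={ATTENTION} prefix_caching={PREFIX_CACHING} prefill_chunking={PREFILL_CHUNKING}")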

backends/gaudi/server/pyproject.toml

Lines changed: 2 additions & 3 deletions
@@ -22,10 +22,9 @@ opentelemetry-instrumentation-grpc = "^0.53b0"
 hf-transfer = "^0.1.9"
 sentencepiece = "^0.2.0"
 peft = "^0.15"
-optimum-habana = "1.17"
-transformers = "^4.49"
+transformers = "^4.52.4"
 numpy = "^1.26"
-accelerate = "^0.33"
+accelerate = "^1.7.0"
 outlines= { version = "^0.0.36", optional = true }
 prometheus-client = "^0.21.1"
 py-cpuinfo = "^9.0.0"

backends/gaudi/server/requirements.txt

Lines changed: 2 additions & 3 deletions
@@ -1,4 +1,4 @@
-accelerate==0.33.0 ; python_version >= "3.9" and python_version < "3.13"
+accelerate==1.7.0 ; python_version >= "3.9" and python_version < "3.13"
 annotated-types==0.7.0 ; python_version >= "3.9" and python_version < "3.13"
 attrs==25.3.0 ; python_version >= "3.9" and python_version < "3.13"
 certifi==2025.1.31 ; python_version >= "3.9" and python_version < "3.13"
@@ -46,7 +46,6 @@ opentelemetry-instrumentation==0.53b0 ; python_version >= "3.9" and python_versi
 opentelemetry-proto==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-sdk==1.32.0 ; python_version >= "3.9" and python_version < "3.13"
 opentelemetry-semantic-conventions==0.53b0 ; python_version >= "3.9" and python_version < "3.13"
-optimum-habana==1.17.0 ; python_version >= "3.9" and python_version < "3.13"
 optimum==1.24.0 ; python_version >= "3.9" and python_version < "3.13"
 outlines==0.0.36 ; python_version >= "3.9" and python_version < "3.13"
 packaging==24.2 ; python_version >= "3.9" and python_version < "3.13"
@@ -76,7 +75,7 @@ sympy==1.13.1 ; python_version >= "3.9" and python_version < "3.13"
 threadpoolctl==3.6.0 ; python_version >= "3.9" and python_version < "3.13"
 tokenizers==0.21.1 ; python_version >= "3.9" and python_version < "3.13"
 tqdm==4.67.1 ; python_version >= "3.9" and python_version < "3.13"
-transformers==4.49.0 ; python_version >= "3.9" and python_version < "3.13"
+transformers==4.52.4 ; python_version >= "3.9" and python_version < "3.13"
 triton==3.2.0 ; python_version >= "3.9" and python_version < "3.13" and platform_system == "Linux" and platform_machine == "x86_64"
 typer==0.15.2 ; python_version >= "3.9" and python_version < "3.13"
 typing-extensions==4.13.2 ; python_version >= "3.9" and python_version < "3.13"
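
Together with the pyproject.toml change above, the lock file drops optimum-habana entirely and moves to transformers 4.52.4 and accelerate 1.7.0. A quick, illustrative way to confirm an installed environment matches the new pins (standalone sketch, not part of the repo):

# Illustrative sanity check for the updated pins; not repo code.
from importlib.metadata import PackageNotFoundError, version

print("transformers:", version("transformers"))  # expected 4.52.4 per requirements.txt
print("accelerate:", version("accelerate"))      # expected 1.7.0

try:
    version("optimum-habana")
    print("warning: optimum-habana is still installed")
except PackageNotFoundError:
    print("optimum-habana absent, as expected after this commit")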

backends/gaudi/server/text_generation_server/cli.py

Lines changed: 13 additions & 76 deletions
@@ -1,6 +1,4 @@
 import os
-import psutil
-import signal
 import sys
 import typer
 
@@ -115,80 +113,19 @@ def serve(
         raise RuntimeError(
             "Only 1 can be set between `dtype` and `quantize`, as they both decide how goes the final model."
         )
-
-    logger.info("CLI SHARDED = {} DTYPE = {}".format(sharded, dtype))
-
-    if sharded and os.getenv("ATTENTION", "default") not in {"paged"}:
-        tgi_file = Path(__file__).resolve().parent / "tgi_service.py"
-        num_shard = int(os.getenv("WORLD_SIZE", "1"))
-        logger.info("CLI SHARDED = {}".format(num_shard))
-        import subprocess
-
-        cmd = (
-            f"deepspeed --num_nodes 1 --num_gpus {num_shard} --no_local_rank {tgi_file}"
-        )
-        cmd += f" --model_id {model_id} --revision {revision} --sharded {sharded}"
-        cmd += f" --dtype {dtype} --trust_remote_code {trust_remote_code} --uds_path {uds_path}"
-        cmd += f" --quantize {quantize} --max_input_tokens {max_input_tokens}"
-        if speculate is not None:
-            cmd += f"--speculate {speculate}"
-        logger.info("CLI server start deepspeed ={} ".format(cmd))
-        sys.stdout.flush()
-        sys.stderr.flush()
-        with subprocess.Popen(cmd, shell=True, executable="/bin/bash") as proc:
-            do_terminate = False
-            current_handler = signal.getsignal(signal.SIGTERM)
-
-            def terminate_handler(sig, frame):
-                nonlocal do_terminate
-                do_terminate = True
-                if callable(current_handler):
-                    current_handler(sig, frame)
-
-            signal.signal(signal.SIGTERM, terminate_handler)
-
-            finished = False
-            while not finished:
-                try:
-                    if do_terminate:
-                        parent = psutil.Process(proc.pid)
-                        all_procs = parent.children(recursive=True) + [parent]
-                        for p in all_procs:
-                            try:
-                                p.terminate()
-                            except psutil.NoSuchProcess:
-                                pass
-                        _, alive = psutil.wait_procs(all_procs, timeout=30)
-                        for p in alive:
-                            p.kill()
-
-                        do_terminate = False
-
-                    proc.wait(timeout=3)
-                except subprocess.TimeoutExpired:
-                    pass
-                else:
-                    finished = True
-
-            sys.stdout.flush()
-            sys.stderr.flush()
-            if proc.returncode != 0:
-                logger.error(f"{cmd} exited with status = {proc.returncode}")
-            return proc.returncode
-    else:
-        server.serve(
-            model_id,
-            lora_adapters,
-            revision,
-            sharded,
-            quantize,
-            speculate,
-            dtype,
-            kv_cache_dtype,
-            trust_remote_code,
-            uds_path,
-            max_input_tokens,
-        )
+    server.serve(
+        model_id,
+        lora_adapters,
+        revision,
+        sharded,
+        quantize,
+        speculate,
+        dtype,
+        kv_cache_dtype,
+        trust_remote_code,
+        uds_path,
+        max_input_tokens,
+    )
 
 
 @app.command()
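
With the non-paged path gone, serve no longer shells out to a DeepSpeed launcher or babysits its process tree; it always calls server.serve(...) directly, which is why the psutil and signal imports could be dropped. For reference, the cleanup the deleted branch performed amounts to this generic pattern (standalone sketch, not code from this repo; psutil assumed installed):

# Generic sketch of the process-tree shutdown the removed DeepSpeed branch did:
# terminate a subprocess and all of its children, then kill any stragglers.
import subprocess

import psutil


def terminate_tree(proc: subprocess.Popen, timeout: float = 30.0) -> None:
    parent = psutil.Process(proc.pid)
    procs = parent.children(recursive=True) + [parent]
    for p in procs:
        try:
            p.terminate()
        except psutil.NoSuchProcess:
            pass  # already exited
    _, alive = psutil.wait_procs(procs, timeout=timeout)
    for p in alive:
        p.kill()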

backends/gaudi/server/text_generation_server/habana_quantization_env.py

Lines changed: 0 additions & 53 deletions
This file was deleted.

backends/gaudi/server/text_generation_server/models/__init__.py

Lines changed: 1 addition & 72 deletions
@@ -5,7 +5,6 @@
 
 from loguru import logger
 from transformers.configuration_utils import PretrainedConfig
-from transformers.models.auto import modeling_auto
 from huggingface_hub import hf_hub_download, HfApi
 from typing import Optional
 from pathlib import Path
@@ -36,14 +35,10 @@
     "Seq2SeqLM",
     "get_model_with_lora_adapters",
 ]
-from text_generation_server.models.globals import ATTENTION
 
 VLM_BATCH_TYPES = set()
-FLASH_ATT_ERROR_MESSAGE = "{} requires Flash Attention enabled models."
 
-FLASH_ATTENTION = False
-if ATTENTION == "paged":
-    FLASH_ATTENTION = True
+FLASH_ATTENTION = True
 
 try:
     from text_generation_server.models.flash_causal_lm import FlashCausalLM
@@ -883,72 +878,6 @@ def get_model(
             trust_remote_code=trust_remote_code,
         )
 
-    from text_generation_server.models.causal_lm import CausalLM
-    from text_generation_server.models.vlm_causal_lm import VlmCausalLM
-    from text_generation_server.models.custom_modeling.mllama import (
-        MllamaForConditionalGeneration,
-    )
-    from text_generation_server.models.custom_modeling.llava_next import (
-        LlavaNextForConditionalGeneration,
-    )
-    from text_generation_server.models.vlm_causal_lm import (
-        VlmCausalLMBatch,
-    )
-
-    VLM_BATCH_TYPES.add(VlmCausalLMBatch)
-
-    from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
-
-    adapt_transformers_to_gaudi()
-    if SDP_ON_BF16 == 1:
-        torch._C._set_math_sdp_allow_fp16_bf16_reduction(True)
-    if model_type == "gpt_bigcode":
-        from text_generation_server.models.starcoder import StarCoder
-
-        return StarCoder(model_id=model_id, revision=revision, dtype=dtype)
-    if model_type == "bloom":
-        from text_generation_server.models.bloom import BLOOM
-
-        return BLOOM(
-            model_id=model_id,
-            revision=revision,
-            speculator=speculator,
-            dtype=dtype,
-            trust_remote_code=trust_remote_code,
-        )
-
-    if model_type == "llava_next":
-        return VlmCausalLM(
-            model_class=LlavaNextForConditionalGeneration,
-            model_id=model_id,
-            revision=revision,
-            quantize=None,
-            speculator=speculator,
-            dtype=dtype,
-            trust_remote_code=trust_remote_code,
-        )
-
-    if model_type == "mllama":
-        return VlmCausalLM(
-            model_class=MllamaForConditionalGeneration,
-            model_id=model_id,
-            revision=revision,
-            quantize=None,
-            speculator=speculator,
-            dtype=dtype,
-            trust_remote_code=trust_remote_code,
-        )
-
-    if model_type in modeling_auto.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
-        return CausalLM(
-            model_id,
-            revision,
-            quantize=quantize,
-            speculator=speculator,
-            dtype=dtype,
-            trust_remote_code=trust_remote_code,
-        )
-
     raise ValueError(f"Unsupported model type {model_type}")
 
 
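FLASH_ATTENTION is now unconditionally True, and the optimum-habana fallback (adapt_transformers_to_gaudi() plus the non-flash CausalLM/VlmCausalLM branches) is gone, so an architecture without a flash implementation falls straight through to the final ValueError. An illustration of what a caller should now expect (sketch only; get_model's full parameter list is not shown in this diff, so model_kwargs is a stand-in):

# Illustrative only, not repo code.
from text_generation_server.models import get_model


def load_or_fail(**model_kwargs):
    # model_kwargs stands in for get_model's real parameters, which this diff elides.
    try:
        return get_model(**model_kwargs)
    except ValueError as err:
        # e.g. ValueError("Unsupported model type <model_type>")
        raise SystemExit(f"Gaudi backend cannot serve this model: {err}")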

backends/gaudi/server/text_generation_server/models/bloom.py

Lines changed: 0 additions & 52 deletions
This file was deleted.
