diff --git a/hack/optimize/neuron/Dockerfile b/hack/optimize/neuron/Dockerfile
new file mode 100644
index 000000000..c2e6aec22
--- /dev/null
+++ b/hack/optimize/neuron/Dockerfile
@@ -0,0 +1,73 @@
+# Use Ubuntu 20.04 as the base image
+FROM ubuntu:20.04
+
+# Neuron SDK component versions
+ARG NEURONX_FRAMEWORK_VERSION=2.11.0.0
+ARG NEURONX_RUNTIME_LIB_VERSION=2.11.7.0
+ARG NEURONX_TOOLS_VERSION=2.11.8.0
+ARG NEURONX_CC_VERSION=2.11.8.0
+
+# Set environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+ENV PYTHONUNBUFFERED=1
+ENV PYTHONIOENCODING=UTF-8
+ENV LD_LIBRARY_PATH="/opt/aws/neuron/lib:/usr/local/lib"
+ENV PATH="/opt/aws/neuron/bin:$PATH"
+
+# Install system dependencies, including libsqlite3-dev and libbz2-dev for the
+# Python build below
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    curl \
+    wget \
+    zlib1g-dev \
+    gnupg2 \
+    libssl-dev \
+    libffi-dev \
+    libsqlite3-dev \
+    libbz2-dev \
+    libopenblas-dev \
+    libomp5 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Add the Neuron apt repository and install Neuron SDK components
+RUN echo "deb https://apt.repos.neuron.amazonaws.com focal main" > /etc/apt/sources.list.d/neuron.list && \
+    wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-NEURON.PUB | apt-key add - && \
+    apt-get update && \
+    apt-get install -y \
+    aws-neuronx-tools=${NEURONX_TOOLS_VERSION} \
+    aws-neuronx-runtime-lib=${NEURONX_RUNTIME_LIB_VERSION} \
+    && rm -rf /var/lib/apt/lists/*
+
+# Build Python 3.10 from source with sqlite3 and bz2 support
+RUN wget -q https://www.python.org/ftp/python/3.10.12/Python-3.10.12.tgz && \
+    tar -xzf Python-3.10.12.tgz && \
+    cd Python-3.10.12 && \
+    ./configure --enable-shared --enable-optimizations --with-ensurepip=install && \
+    make -j $(nproc) && make install && \
+    cd .. && rm -rf Python-3.10.12*
+
+# Upgrade pip
+RUN python3.10 -m pip install --upgrade pip
+
+# Install Neuron-related Python packages from the Neuron pip repository
+RUN python3.10 -m pip install --no-cache-dir \
+    --extra-index-url https://pip.repos.neuron.amazonaws.com \
+    torch-neuronx==${NEURONX_FRAMEWORK_VERSION} \
+    torch-xla==1.13.* \
+    torchvision
+
+# Install additional Python packages
+RUN python3.10 -m pip install --no-cache-dir \
+    transformers==4.29 \
+    numpy==1.23 \
+    pynvml
+
+# Set the working directory
+WORKDIR /app
+
+# Copy training and inference scripts
+COPY train_bert_neuron.py /app/train_bert_neuron.py
+COPY infer_bert_neuron.py /app/infer_bert_neuron.py
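+
+# A minimal usage sketch (an assumption, not wired into any automation): build the
+# image and run a script on a Neuron host (e.g. trn1/inf2) with a Neuron device
+# exposed to the container:
+#   docker build -t bert-neuron hack/optimize/neuron
+#   docker run --device=/dev/neuron0 bert-neuron python3.10 /app/train_bert_neuron.py
+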
diff --git a/hack/optimize/neuron/infer_bert_neuron.py b/hack/optimize/neuron/infer_bert_neuron.py
new file mode 100644
index 000000000..9936b654b
--- /dev/null
+++ b/hack/optimize/neuron/infer_bert_neuron.py
@@ -0,0 +1,82 @@
+import os
+
+# Unset XLA_FLAGS to avoid GPU-specific issues on Neuron
+os.environ.pop('XLA_FLAGS', None)
+
+import time
+
+import torch
+import torch_neuronx
+from transformers import BertTokenizer, BertForPreTraining
+from torch.utils.data import DataLoader, TensorDataset
+
+def create_dummy_data(tokenizer, num_samples=1000, max_length=128):
+    sentences = [
+        f"This is a dummy sentence number {i}" for i in range(num_samples)
+    ]
+    tokenized_inputs = tokenizer(
+        sentences,
+        max_length=max_length,
+        padding="max_length",
+        truncation=True,
+        return_tensors="pt",
+    )
+    labels = tokenized_inputs.input_ids.detach().clone()
+    next_sentence_labels = torch.randint(0, 2, (num_samples,))
+    return TensorDataset(
+        tokenized_inputs.input_ids,
+        tokenized_inputs.attention_mask,
+        labels,
+        next_sentence_labels,
+    )
+
+def infer_bert_neuron(model, tokenizer, batch_sizes):
+    dataset = create_dummy_data(tokenizer)
+    results = []
+
+    for batch_size in batch_sizes:
+        try:
+            dataloader = DataLoader(dataset, batch_size=batch_size)
+            start_time = time.time()
+            for batch in dataloader:
+                inputs, masks, labels, next_sentence_labels = batch
+                # The compiled model is a TorchScript module: it takes CPU tensors,
+                # passed positionally in the order used at trace time
+                outputs = model(inputs, masks)
+            end_time = time.time()
+            inference_time = end_time - start_time
+            throughput = len(dataset) / inference_time
+
+            print(f"Batch Size: {batch_size}")
+            print(f"Inference time: {inference_time:.2f} seconds")
+            print(f"Throughput: {throughput:.2f} samples/second")
+
+            results.append({
+                'batch_size': batch_size,
+                'throughput': throughput,
+            })
+            break  # Exit after the first batch size that fits
+
+        except RuntimeError as e:
+            if 'out of memory' in str(e).lower():
+                # There is no CUDA cache to clear on Neuron; just try the next
+                # (smaller) batch size
+                print(f"Batch Size {batch_size}: Out of Memory. Trying smaller batch size.")
+                continue
+            else:
+                raise
+
+    print("Optimal Batch Size Found:")
+    for res in results:
+        print(f"Batch Size: {res['batch_size']}, Throughput: {res['throughput']:.2f} samples/sec")
+
+def main():
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    # torchscript=True makes the model return tuples instead of ModelOutput
+    # objects, which tracing requires
+    model = BertForPreTraining.from_pretrained("bert-base-uncased", torchscript=True)
+    model.eval()
+
+    # torch_neuronx.trace compiles ahead of time for the example input shapes;
+    # trace with both input_ids and attention_mask so the compiled module matches
+    # the positional (inputs, masks) call above
+    example_inputs = (
+        torch.randint(0, 2000, (1, 128)),
+        torch.ones(1, 128, dtype=torch.long),
+    )
+    model_neuron = torch_neuronx.trace(model, example_inputs)
+    # A compiled module only accepts the shapes it was traced with; dynamic
+    # batching lets the batch-size sweep below vary the first dimension
+    model_neuron = torch_neuronx.dynamic_batch(model_neuron)
+
+    batch_sizes = [128, 64, 32, 16, 8]
+    infer_bert_neuron(model_neuron, tokenizer, batch_sizes)
+
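+# A possible extension (a sketch; the file name is illustrative): compiled Neuron
+# modules are TorchScript modules, so they can be saved and reloaded to skip
+# recompilation on later runs:
+#
+#   torch.jit.save(model_neuron, "bert_neuron.pt")
+#   model_neuron = torch.jit.load("bert_neuron.pt")
+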
+if __name__ == "__main__":
+    main()
+
diff --git a/hack/optimize/neuron/train_bert_neuron.py b/hack/optimize/neuron/train_bert_neuron.py
new file mode 100644
index 000000000..6b9c32436
--- /dev/null
+++ b/hack/optimize/neuron/train_bert_neuron.py
@@ -0,0 +1,103 @@
+import os
+
+# Unset XLA_FLAGS to avoid GPU-specific issues on Neuron
+os.environ.pop('XLA_FLAGS', None)
+
+import time
+
+import torch
+import torch_xla.core.xla_model as xm
+from transformers import BertForPreTraining, BertTokenizer
+from torch.utils.data import DataLoader, TensorDataset
+
+def create_dummy_data(tokenizer, num_samples=1000, max_length=128):
+    sentences = [
+        f"This is a dummy sentence number {i}" for i in range(num_samples)
+    ]
+    tokenized_inputs = tokenizer(
+        sentences,
+        max_length=max_length,
+        padding="max_length",
+        truncation=True,
+        return_tensors="pt",
+    )
+    labels = tokenized_inputs.input_ids.detach().clone()
+    next_sentence_labels = torch.randint(0, 2, (num_samples,))
+    return TensorDataset(
+        tokenized_inputs.input_ids,
+        tokenized_inputs.attention_mask,
+        labels,
+        next_sentence_labels,
+    )
+
+def train_bert_neuron(model, tokenizer, batch_sizes, device):
+    model.train()
+    model.to(device)
+
+    dataset = create_dummy_data(tokenizer)
+    results = []
+
+    for batch_size in batch_sizes:
+        try:
+            train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
+            optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
+
+            # Measure training time for throughput calculation
+            start_time = time.time()
+            for batch in train_dataloader:
+                optimizer.zero_grad()
+                inputs, masks, labels, next_sentence_labels = batch
+                inputs, masks, labels, next_sentence_labels = (
+                    inputs.to(device),
+                    masks.to(device),
+                    labels.to(device),
+                    next_sentence_labels.to(device),
+                )
+                outputs = model(
+                    input_ids=inputs,
+                    attention_mask=masks,
+                    labels=labels,
+                    next_sentence_label=next_sentence_labels,
+                )
+                loss = outputs.loss
+                loss.backward()
+                optimizer.step()
+                # XLA executes lazily: materialize the graph every iteration so
+                # the step (and any out-of-memory error) actually happens here
+                xm.mark_step()
+            end_time = time.time()
+            training_time = end_time - start_time
+            throughput = len(dataset) / training_time
+
+            print(f"Batch Size: {batch_size}")
+            print(f"Training time: {training_time:.2f} seconds")
+            print(f"Throughput: {throughput:.2f} samples/second")
+
+            results.append({
+                'batch_size': batch_size,
+                'throughput': throughput,
+            })
+            break  # Exit after the first batch size that fits
+
+        except RuntimeError as e:
+            if 'out of memory' in str(e).lower():
+                # There is no CUDA cache to clear on Neuron; just try the next
+                # (smaller) batch size
+                print(f"Batch Size {batch_size}: Out of Memory. Trying smaller batch size.")
+                continue
+            else:
+                raise
+
+    print("Optimal Batch Size Found:")
+    for res in results:
+        print(f"Batch Size: {res['batch_size']}, Throughput: {res['throughput']:.2f} samples/sec")
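+
+# A possible extension (a sketch, assuming torch-xla's standard multiprocessing
+# API): run the same loop on every NeuronCore instead of a single device:
+#
+#   import torch_xla.distributed.xla_multiprocessing as xmp
+#
+#   def _mp_fn(index):
+#       main()
+#
+#   xmp.spawn(_mp_fn)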
+
+def main():
+    device = xm.xla_device()
+
+    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    model = BertForPreTraining.from_pretrained("bert-base-uncased")
+
+    batch_sizes = [128, 64, 32, 16, 8]
+
+    train_bert_neuron(model, tokenizer, batch_sizes, device)
+
+if __name__ == "__main__":
+    main()
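+
+# While tuning batch sizes it can help to watch NeuronCore utilization and device
+# memory with neuron-top (installed via aws-neuronx-tools in the Dockerfile above)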