TPU
This commit is contained in:
150
model_training_nnn_tpu/setup_tensorflow_tpu.sh
Normal file
150
model_training_nnn_tpu/setup_tensorflow_tpu.sh
Normal file
@@ -0,0 +1,150 @@
|
||||
#!/bin/bash
# Setup script for TensorFlow Brain-to-Text training on TPU v5e-8
#
# Usage: ./setup_tensorflow_tpu.sh
#
# This script prepares the environment for training the brain-to-text model
# using TensorFlow on TPU v5e-8 hardware.

# Abort on the first failing command. `-u` and `-o pipefail` are intentionally
# left off: later sections read possibly-unset variables and count files with
# pipelines whose first stage may legitimately fail.
set -e

echo "=== TensorFlow TPU v5e-8 Setup Script ==="
echo "Setting up environment for brain-to-text training..."

# Check if we're in a TPU environment. This is advisory only: the script
# continues either way. The `:-` defaults keep the test safe when the
# variables are unset.
if [[ -z "${TPU_NAME:-}" && -z "${COLAB_TPU_ADDR:-}" ]]; then
  echo "Warning: TPU environment variables not detected." >&2
  echo "Make sure you're running on a TPU v5e-8 instance." >&2
fi

# Create (or reuse) the conda environment for TensorFlow TPU training.
ENV_NAME="b2txt_tf"
echo "Creating conda environment: ${ENV_NAME}"

# Load conda's shell integration into this non-interactive shell.
# `conda activate` is a shell function installed by this hook; without it,
# activation fails in scripts. Returns non-zero if the hook cannot be obtained.
init_conda_shell() {
  local hook
  hook="$(conda shell.bash hook)" || return
  eval "$hook"
}

# Succeed if an environment named exactly $1 is listed by `conda env list`.
# awk consumes the whole listing, so conda never writes to a closed pipe.
conda_env_exists() {
  conda env list | awk -v name="$1" '$1 == name { found = 1 } END { exit !found }'
}

init_conda_shell

if conda_env_exists "${ENV_NAME}"; then
  echo "Environment ${ENV_NAME} already exists. Activating..."
else
  echo "Creating new environment..."
  conda create -n "${ENV_NAME}" python=3.10 -y
fi
conda activate "${ENV_NAME}"

# Install TensorFlow with TPU support
echo "Installing TensorFlow with TPU support..."
# The requirement specifier must be quoted: unquoted, the shell treats `>` as
# an output redirection (creating a file named "=2.15.0" and dropping the
# version constraint) and `[and-cuda]` as a glob pattern.
# NOTE(review): the `and-cuda` extra appears to target NVIDIA GPU installs;
# confirm it is the intended package variant for a TPU host.
pip install "tensorflow[and-cuda]>=2.15.0"

# Install additional requirements (path is relative to the current directory)
echo "Installing additional requirements..."
pip install -r requirements_tf.txt

# Set up TPU environment variables
echo "Configuring TPU environment variables..."

# Print the export lines that are persisted to the shell rc file and applied
# to the current shell. The first line doubles as the "already installed"
# marker checked by install_tpu_env.
tpu_env_block() {
  cat << 'EOF'
# TPU v5e-8 Environment Variables
export TPU_ML_PLATFORM="TensorFlow"
export XLA_USE_BF16=1
export TF_XLA_FLAGS="--tf_xla_auto_jit=2 --tf_xla_cpu_global_jit"
export TPU_MEGACORE=1
export LIBTPU_INIT_ARGS="--xla_tpu_spmd_threshold_for_allgather_cse=10000"

# Disable TensorFlow warnings for cleaner output
export TF_CPP_MIN_LOG_LEVEL=2

# Memory optimizations
export TF_FORCE_GPU_ALLOW_GROWTH=true
export TF_GPU_THREAD_MODE=gpu_private
EOF
}

# install_tpu_env [rcfile]
# Append the TPU export block to rcfile (default: ~/.bashrc) unless its marker
# line is already present, so repeated runs do not pile up duplicates. Then
# apply the same exports to the current shell directly instead of sourcing the
# rc file: a bashrc commonly returns early in non-interactive shells, and any
# failing command in it would abort this script under `set -e`.
install_tpu_env() {
  local rc_file="${1:-${HOME}/.bashrc}"
  local marker="# TPU v5e-8 Environment Variables"

  if [[ -f "${rc_file}" ]] && grep -qxF -- "${marker}" "${rc_file}"; then
    echo "TPU environment variables already present in ${rc_file}; not appending again."
  else
    { echo; tpu_env_block; echo; } >> "${rc_file}"
  fi

  # The block is a static, script-owned list of exports, so eval is safe here.
  eval "$(tpu_env_block)"
}

install_tpu_env

# Test TPU connectivity
# Runs an inline Python check. A TPU initialization failure is reported but
# does not fail the script; an import error or a mixed-precision error still
# exits non-zero and aborts under `set -e`.
echo "Testing TPU connectivity..."
python3 << 'EOF'
import tensorflow as tf
print("TensorFlow version:", tf.__version__)

try:
    # Resolve, connect to, and initialize the TPU system, then build a
    # distribution strategy to report how many replicas are visible.
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.TPUStrategy(resolver)
    print("TPU cluster initialized successfully!")
    print(f"Number of TPU cores: {strategy.num_replicas_in_sync}")
    print(f"TPU devices: {tf.config.list_logical_devices('TPU')}")
except Exception as e:
    # Best-effort probe: report the problem and keep going.
    print(f"TPU initialization failed: {e}")
    print("You may be running on CPU/GPU instead of TPU")

# Test mixed precision
policy = tf.keras.mixed_precision.Policy('mixed_bfloat16')
tf.keras.mixed_precision.set_global_policy(policy)
print(f"Mixed precision policy: {policy.name}")
EOF

# Verify data directory exists
DATA_DIR="../data/hdf5_data_final"

# count_sessions DIR
# Print the number of entries in DIR whose names match the glob `t*.20*`.
# Uses a glob instead of parsing `ls`, so directory names containing spaces
# are handled and no match yields 0. The body runs in a subshell so that
# enabling nullglob does not leak into the caller.
count_sessions() (
  shopt -s nullglob
  sessions=("$1"/t*.20*)
  echo "${#sessions[@]}"
)

if [[ -d "$DATA_DIR" ]]; then
  echo "Data directory found: $DATA_DIR"
  # Count available sessions
  SESSION_COUNT="$(count_sessions "$DATA_DIR")"
  echo "Available sessions: $SESSION_COUNT"
else
  # Advisory only: setup continues without the dataset.
  echo "Warning: Data directory not found at $DATA_DIR" >&2
  echo "Please ensure the dataset is available before training." >&2
fi

# Create output directories (relative to the current directory)
echo "Creating output directories..."
mkdir -p trained_models/tensorflow_tpu logs/tensorflow_tpu eval_output

# make_executable FILE...
# Add the executable bit to each FILE that exists. A missing file produces a
# warning on stderr instead of aborting the whole setup under `set -e`, in
# line with the warn-only handling of a missing data directory.
make_executable() {
  local script
  for script in "$@"; do
    if [[ -f "$script" ]]; then
      chmod +x -- "$script"
    else
      echo "Warning: $script not found; skipping chmod." >&2
    fi
  done
}

# Make scripts executable
echo "Setting script permissions..."
make_executable train_model_tf.py evaluate_model_tf.py

# Display system information
echo "=== System Information ==="
echo "Python version: $(python --version)"
echo "Conda environment: ${CONDA_DEFAULT_ENV:-}"
# Seventh column of the "Mem:" row printed by `free -h`.
echo "Available memory: $(free -h | awk '/^Mem:/ { print $7 }')"
echo "CPU cores: $(nproc)"

# Check for GPU/TPU
echo "=== Hardware Information ==="
# A successful nvidia-smi run is used as the GPU presence probe.
if nvidia-smi &> /dev/null; then
  echo "NVIDIA GPUs detected:"
  nvidia-smi --list-gpus
else
  echo "No NVIDIA GPUs detected"
fi

# print_tpu_info
# Report which TPU-related environment variable is set. TPU_NAME takes
# precedence over COLAB_TPU_ADDR; unset and empty are treated the same.
print_tpu_info() {
  if [[ -n "${TPU_NAME:-}" ]]; then
    echo "TPU Name: ${TPU_NAME}"
  elif [[ -n "${COLAB_TPU_ADDR:-}" ]]; then
    echo "Colab TPU Address: ${COLAB_TPU_ADDR}"
  else
    echo "No TPU environment variables detected"
  fi
}

print_tpu_info

# print_next_steps ENV_NAME
# Print the closing summary: how to activate the conda environment named
# ENV_NAME and how to launch training and evaluation.
print_next_steps() {
  local env_name=$1
  cat << EOF

=== Setup Complete ===
Environment '${env_name}' is ready for TensorFlow TPU training.

To activate the environment:
 conda activate ${env_name}

To start training:
 python train_model_tf.py --config_path rnn_args.yaml

To run evaluation:
 python evaluate_model_tf.py --model_path path/to/checkpoint --config_path rnn_args.yaml

For more options, use --help with any script.
EOF
}

print_next_steps "${ENV_NAME}"
Reference in New Issue
Block a user