b2txt25/model_training_nnn_tpu/launch_tpu_training.py

#!/usr/bin/env python3
"""
TPU Training Launch Script for Brain-to-Text RNN Model

This script provides easy TPU training setup using Accelerate library.
Supports both single TPU core and multi-core (8 cores) training.

Usage:
    python launch_tpu_training.py --config rnn_args.yaml --num_cores 8

Requirements:
    - PyTorch XLA installed
    - Accelerate library installed
    - TPU runtime available
"""

import argparse
import yaml
import os
import sys
from pathlib import Path

def update_config_for_tpu(config_path, num_cores=8):
    """
    Update configuration file to enable TPU training
    """
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)

    # Enable TPU settings
    config['use_tpu'] = True
    config['num_tpu_cores'] = num_cores
    config['dataloader_num_workers'] = 0  # Required for TPU
    config['use_amp'] = True  # Enable mixed precision with bfloat16

    # Adjust batch size and gradient accumulation for multi-core TPU
    if num_cores > 1:
        # Distribute batch size across cores
        original_batch_size = config['dataset']['batch_size']
        config['dataset']['batch_size'] = max(1, original_batch_size // num_cores)
        config['gradient_accumulation_steps'] = max(1, config.get('gradient_accumulation_steps', 1))

        print(f"Adjusted batch size from {original_batch_size} to {config['dataset']['batch_size']} per core")
        print(f"Gradient accumulation steps: {config['gradient_accumulation_steps']}")

    # Save updated config
    tpu_config_path = config_path.replace('.yaml', '_tpu.yaml')
    with open(tpu_config_path, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)

    print(f"TPU configuration saved to: {tpu_config_path}")
    return tpu_config_path

def check_tpu_environment():
    """
    Check if TPU environment is properly set up
    """
    try:
        import torch_xla
        import torch_xla.core.xla_model as xm

        # Check if TPUs are available
        device = xm.xla_device()
        print(f"TPU device available: {device}")
        print(f"TPU ordinal: {xm.get_ordinal()}")
        print(f"TPU world size: {xm.xrt_world_size()}")

        return True
    except ImportError:
        print("ERROR: torch_xla not installed. Please install PyTorch XLA for TPU support.")
        return False
    except Exception as e:
        print(f"ERROR: TPU not available - {e}")
        return False

def run_tpu_training(config_path, num_cores=8):
    """
    Launch TPU training using accelerate
    """
    # Check TPU environment
    if not check_tpu_environment():
        sys.exit(1)

    # Update config for TPU
    tpu_config_path = update_config_for_tpu(config_path, num_cores)

    # Set TPU environment variables BEFORE launching training
    os.environ['TPU_CORES'] = str(num_cores)
    os.environ['XLA_USE_BF16'] = '1'  # Enable bfloat16

    # Critical XLA multi-threading settings - must be set before torch_xla import
    cpu_count = os.cpu_count()
    os.environ['XLA_FLAGS'] = (
        '--xla_cpu_multi_thread_eigen=true '
        '--xla_cpu_enable_fast_math=true '
        f'--xla_force_host_platform_device_count={cpu_count}'
    )
    os.environ['PYTORCH_XLA_COMPILATION_THREADS'] = str(cpu_count)

    print(f"Set XLA compilation to use {cpu_count} CPU threads")
    print(f"XLA_FLAGS: {os.environ['XLA_FLAGS']}")
    print(f"PYTORCH_XLA_COMPILATION_THREADS: {os.environ['PYTORCH_XLA_COMPILATION_THREADS']}")

    # Launch training with accelerate using subprocess to ensure environment variables are passed
    cmd = f"accelerate launch --tpu --num_processes {num_cores} train_model.py --config_path {tpu_config_path}"

    print(f"Launching TPU training with command:")
    print(f"  {cmd}")
    print(f"Using {num_cores} TPU cores")
    print("-" * 60)

    # Use subprocess to ensure environment variables are properly inherited
    import subprocess

    # Create environment with our XLA settings
    env = os.environ.copy()
    env.update({
        'TPU_CORES': str(num_cores),
        'XLA_USE_BF16': '1',
        'XLA_FLAGS': (
            '--xla_cpu_multi_thread_eigen=true '
            '--xla_cpu_enable_fast_math=true '
            f'--xla_force_host_platform_device_count={cpu_count}'
        ),
        'PYTORCH_XLA_COMPILATION_THREADS': str(cpu_count)
    })

    print(f"Environment variables set for subprocess:")
    print(f"  XLA_FLAGS: {env['XLA_FLAGS']}")
    print(f"  PYTORCH_XLA_COMPILATION_THREADS: {env['PYTORCH_XLA_COMPILATION_THREADS']}")
    print("-" * 60)

    # Execute training with proper environment
    result = subprocess.run(cmd.split(), env=env)
    return result.returncode

def main():
    parser = argparse.ArgumentParser(description='Launch TPU training for Brain-to-Text RNN')
    parser.add_argument('--config', default='rnn_args.yaml',
                       help='Path to configuration file (default: rnn_args.yaml)')
    parser.add_argument('--num_cores', type=int, default=8,
                       help='Number of TPU cores to use (default: 8)')
    parser.add_argument('--check_only', action='store_true',
                       help='Only check TPU environment, do not launch training')

    args = parser.parse_args()

    # Verify config file exists
    if not os.path.exists(args.config):
        print(f"ERROR: Configuration file {args.config} not found")
        sys.exit(1)

    if args.check_only:
        check_tpu_environment()
        return

    # Run TPU training
    run_tpu_training(args.config, args.num_cores)

if __name__ == "__main__":
    main()
tpu 2025-10-15 14:26:11 +08:00			`#!/usr/bin/env python3`
			`"""`
			`TPU Training Launch Script for Brain-to-Text RNN Model`

			`This script provides easy TPU training setup using Accelerate library.`
			`Supports both single TPU core and multi-core (8 cores) training.`

			`Usage:`
			`python launch_tpu_training.py --config rnn_args.yaml --num_cores 8`

			`Requirements:`
			`- PyTorch XLA installed`
			`- Accelerate library installed`
			`- TPU runtime available`
			`"""`

			`import argparse`
			`import yaml`
			`import os`
			`import sys`
			`from pathlib import Path`

			`def update_config_for_tpu(config_path, num_cores=8):`
			`"""`
			`Update configuration file to enable TPU training`
			`"""`
			`with open(config_path, 'r') as f:`
			`config = yaml.safe_load(f)`

			`# Enable TPU settings`
			`config['use_tpu'] = True`
			`config['num_tpu_cores'] = num_cores`
			`config['dataloader_num_workers'] = 0 # Required for TPU`
			`config['use_amp'] = True # Enable mixed precision with bfloat16`

			`# Adjust batch size and gradient accumulation for multi-core TPU`
			`if num_cores > 1:`
			`# Distribute batch size across cores`
			`original_batch_size = config['dataset']['batch_size']`
			`config['dataset']['batch_size'] = max(1, original_batch_size // num_cores)`
			`config['gradient_accumulation_steps'] = max(1, config.get('gradient_accumulation_steps', 1))`

			`print(f"Adjusted batch size from {original_batch_size} to {config['dataset']['batch_size']} per core")`
			`print(f"Gradient accumulation steps: {config['gradient_accumulation_steps']}")`

			`# Save updated config`
			`tpu_config_path = config_path.replace('.yaml', '_tpu.yaml')`
			`with open(tpu_config_path, 'w') as f:`
			`yaml.dump(config, f, default_flow_style=False)`

			`print(f"TPU configuration saved to: {tpu_config_path}")`
			`return tpu_config_path`

			`def check_tpu_environment():`
			`"""`
			`Check if TPU environment is properly set up`
			`"""`
			`try:`
			`import torch_xla`
			`import torch_xla.core.xla_model as xm`

			`# Check if TPUs are available`
			`device = xm.xla_device()`
			`print(f"TPU device available: {device}")`
			`print(f"TPU ordinal: {xm.get_ordinal()}")`
			`print(f"TPU world size: {xm.xrt_world_size()}")`

			`return True`
			`except ImportError:`
			`print("ERROR: torch_xla not installed. Please install PyTorch XLA for TPU support.")`
			`return False`
			`except Exception as e:`
			`print(f"ERROR: TPU not available - {e}")`
			`return False`

			`def run_tpu_training(config_path, num_cores=8):`
			`"""`
			`Launch TPU training using accelerate`
			`"""`
			`# Check TPU environment`
			`if not check_tpu_environment():`
			`sys.exit(1)`

			`# Update config for TPU`
			`tpu_config_path = update_config_for_tpu(config_path, num_cores)`

			`# Set TPU environment variables BEFORE launching training`
			`os.environ['TPU_CORES'] = str(num_cores)`
			`os.environ['XLA_USE_BF16'] = '1' # Enable bfloat16`

			`# Critical XLA multi-threading settings - must be set before torch_xla import`
			`cpu_count = os.cpu_count()`
			`os.environ['XLA_FLAGS'] = (`
			`'--xla_cpu_multi_thread_eigen=true '`
			`'--xla_cpu_enable_fast_math=true '`
			`f'--xla_force_host_platform_device_count={cpu_count}'`
			`)`
			`os.environ['PYTORCH_XLA_COMPILATION_THREADS'] = str(cpu_count)`

			`print(f"Set XLA compilation to use {cpu_count} CPU threads")`
			`print(f"XLA_FLAGS: {os.environ['XLA_FLAGS']}")`
			`print(f"PYTORCH_XLA_COMPILATION_THREADS: {os.environ['PYTORCH_XLA_COMPILATION_THREADS']}")`

			`# Launch training with accelerate using subprocess to ensure environment variables are passed`
			`cmd = f"accelerate launch --tpu --num_processes {num_cores} train_model.py --config_path {tpu_config_path}"`

			`print(f"Launching TPU training with command:")`
			`print(f" {cmd}")`
			`print(f"Using {num_cores} TPU cores")`
			`print("-" * 60)`

			`# Use subprocess to ensure environment variables are properly inherited`
			`import subprocess`

			`# Create environment with our XLA settings`
			`env = os.environ.copy()`
			`env.update({`
			`'TPU_CORES': str(num_cores),`
			`'XLA_USE_BF16': '1',`
			`'XLA_FLAGS': (`
			`'--xla_cpu_multi_thread_eigen=true '`
			`'--xla_cpu_enable_fast_math=true '`
			`f'--xla_force_host_platform_device_count={cpu_count}'`
			`),`
			`'PYTORCH_XLA_COMPILATION_THREADS': str(cpu_count)`
			`})`

			`print(f"Environment variables set for subprocess:")`
			`print(f" XLA_FLAGS: {env['XLA_FLAGS']}")`
			`print(f" PYTORCH_XLA_COMPILATION_THREADS: {env['PYTORCH_XLA_COMPILATION_THREADS']}")`
			`print("-" * 60)`

			`# Execute training with proper environment`
			`result = subprocess.run(cmd.split(), env=env)`
			`return result.returncode`

			`def main():`
			`parser = argparse.ArgumentParser(description='Launch TPU training for Brain-to-Text RNN')`
			`parser.add_argument('--config', default='rnn_args.yaml',`
			`help='Path to configuration file (default: rnn_args.yaml)')`
			`parser.add_argument('--num_cores', type=int, default=8,`
			`help='Number of TPU cores to use (default: 8)')`
			`parser.add_argument('--check_only', action='store_true',`
			`help='Only check TPU environment, do not launch training')`

			`args = parser.parse_args()`

			`# Verify config file exists`
			`if not os.path.exists(args.config):`
			`print(f"ERROR: Configuration file {args.config} not found")`
			`sys.exit(1)`

			`if args.check_only:`
			`check_tpu_environment()`
			`return`

			`# Run TPU training`
			`run_tpu_training(args.config, args.num_cores)`

			`if __name__ == "__main__":`
			`main()`