diff --git a/TPU_ISSUES_RECORD.md b/TPU_ISSUES_RECORD.md index 51f5412..74e6503 100644 --- a/TPU_ISSUES_RECORD.md +++ b/TPU_ISSUES_RECORD.md @@ -226,6 +226,31 @@ All TPU training issues have been systematically identified and fixed: **Ready for TPU training test** with 687M parameter brain-to-text model. +--- + +## New Issue: TPU Memory Exhaustion (2025-10-12 15:00) +``` +RuntimeError: Bad StatusOr access: RESOURCE_EXHAUSTED: Error allocating device buffer: Attempting to allocate 3.50M. That was not possible. There are 2.07M free.; (0x0x0_HBM0) +``` + +**Root Cause**: TPU HBM memory fragmentation with batch_size=64 +- Single batch: 64 × (512 features × 14 patches) × 2 bytes = ~917KB per batch +- Combined with 687M model parameters + gradients + activations → memory exhaustion +- TPU memory allocation is stricter than on GPU and requires contiguous blocks + +**Solution**: Memory-optimized configuration +```yaml +# rnn_args.yaml optimizations: +batch_size: 32 # reduced from 64 +gradient_accumulation_steps: 2 # maintains effective batch size of 64 +num_dataloader_workers: 0 # TPU compatibility +``` + +**Memory Calculation**: +- New batch memory: 32 × 7168 × 2 bytes = ~458KB (50% reduction) +- Gradient accumulation maintains training stability +- Effective batch size unchanged: 2 steps × 32 = 64 samples + ## Lessons Learned - **Root Cause**: TPU XLA compiler requires strict dtype consistency across all tensors - **Key Insight**: `torch.eye()` and `torch.zeros()` default to f32 - must explicitly specify dtype diff --git a/model_training_nnn/rnn_args.yaml b/model_training_nnn/rnn_args.yaml index 54c4f79..f0e6847 100644 --- a/model_training_nnn/rnn_args.yaml +++ b/model_training_nnn/rnn_args.yaml @@ -21,7 +21,7 @@ use_amp: true # whether to use automatic mixed precision (AMP) for training # TPU and distributed training settings use_tpu: true # whether to use TPU for training (set to true for TPU) num_tpu_cores: 8 # number of TPU cores to use (full TPU v3-8) 
-gradient_accumulation_steps: 1 # number of gradient accumulation steps for distributed training +gradient_accumulation_steps: 2 # number of gradient accumulation steps for distributed training (2x32=64 effective batch size) output_dir: trained_models/baseline_rnn # directory to save the trained model and logs checkpoint_dir: trained_models/baseline_rnn/checkpoint # directory to save checkpoints during training @@ -75,13 +75,12 @@ dataset: smooth_kernel_std: 2 # standard deviation of the smoothing kernel applied to the data neural_dim: 512 # dimensionality of the neural data - batch_size: 64 # batch size for training + batch_size: 32 # batch size for training (reduced for TPU memory constraints) n_classes: 41 # number of classes (phonemes) in the dataset max_seq_elements: 500 # maximum number of sequence elements (phonemes) for any trial days_per_batch: 4 # number of randomly-selected days to include in each batch seed: 1 # random seed for reproducibility - num_dataloader_workers: 4 # number of workers for the data loader - dataloader_num_workers: 0 # set to 0 for TPU to avoid multiprocessing issues + num_dataloader_workers: 0 # set to 0 for TPU to avoid multiprocessing issues loader_shuffle: false # whether to shuffle the data loader must_include_days: null # specific days to include in the dataset test_percentage: 0.1 # percentage of data to use for testing