tpu manual dataloader

2025-10-12 20:43:43 +08:00
parent bc015f5efb
commit 7cc9c41b7f
1 changed file with 76 additions and 31 deletions
--- a/model_training_nnn/rnn_trainer.py
+++ b/model_training_nnn/rnn_trainer.py
@@ -49,7 +49,7 @@ class BrainToTextDecoder_Trainer:
        )
        # Set even_batches to False after initialization - required for batch_size=None DataLoaders
-        self.accelerator.even_batches = False
+        # Note: This may not be settable in all Accelerate versions, but we handle it in DataLoader config
        # Trainer fields
        self.args = args
@@ -280,6 +280,23 @@ class BrainToTextDecoder_Trainer:
                param.requires_grad = False
        # Prepare model, optimizer, scheduler, and dataloaders for distributed training
        # For TPU, don't prepare DataLoaders with Accelerator to avoid batch_sampler issues
        use_tpu = self.args.get('use_tpu', False)
        if use_tpu:
            # On TPU, only prepare model, optimizer, and scheduler
            (
                self.model,
                self.optimizer,
                self.learning_rate_scheduler,
            ) = self.accelerator.prepare(
                self.model,
                self.optimizer,
                self.learning_rate_scheduler,
            )
            # DataLoaders remain unprepared but will work with our custom configuration
        else:
            # Standard GPU/CPU preparation including DataLoaders
            (
                self.model,
                self.optimizer,
@@ -563,7 +580,17 @@ class BrainToTextDecoder_Trainer:
            # Train step
            start_time = time.time()
-            # Data is automatically moved to device by Accelerator
+            # Handle data movement - for TPU, manually move to device since DataLoader wasn't prepared by Accelerator
            use_tpu = self.args.get('use_tpu', False)
            if use_tpu:
                # Manual data movement for TPU since DataLoaders are not prepared by Accelerator
                features = batch['input_features'].to(self.device)
                labels = batch['seq_class_ids'].to(self.device)
                n_time_steps = batch['n_time_steps'].to(self.device)
                phone_seq_lens = batch['phone_seq_lens'].to(self.device)
                day_indicies = batch['day_indicies'].to(self.device)
            else:
                # For GPU/CPU, data is automatically moved to device by Accelerator
                features = batch['input_features']
                labels = batch['seq_class_ids']
                n_time_steps = batch['n_time_steps']
@@ -732,7 +759,17 @@ class BrainToTextDecoder_Trainer:
        for i, batch in enumerate(loader):
-            # Data is automatically moved to device by Accelerator
+            # Handle data movement - for TPU, manually move to device since DataLoader wasn't prepared by Accelerator
            use_tpu = self.args.get('use_tpu', False)
            if use_tpu:
                # Manual data movement for TPU since DataLoaders are not prepared by Accelerator
                features = batch['input_features'].to(self.device)
                labels = batch['seq_class_ids'].to(self.device)
                n_time_steps = batch['n_time_steps'].to(self.device)
                phone_seq_lens = batch['phone_seq_lens'].to(self.device)
                day_indicies = batch['day_indicies'].to(self.device)
            else:
                # For GPU/CPU, data is automatically moved to device by Accelerator
                features = batch['input_features']
                labels = batch['seq_class_ids']
                n_time_steps = batch['n_time_steps']
@@ -838,7 +875,15 @@ class BrainToTextDecoder_Trainer:
        '''
        self.model.eval()
-        # Data is automatically moved to device by Accelerator
+        # Handle data movement - for TPU, manually move to device since DataLoader wasn't prepared by Accelerator
        use_tpu = self.args.get('use_tpu', False)
        if use_tpu:
            # Manual data movement for TPU since DataLoaders are not prepared by Accelerator
            features = batch['input_features'].to(self.device)
            day_indicies = batch['day_indicies'].to(self.device)
            n_time_steps = batch['n_time_steps'].to(self.device)
        else:
            # For GPU/CPU, data is automatically moved to device by Accelerator
            features = batch['input_features']
            day_indicies = batch['day_indicies']
            n_time_steps = batch['n_time_steps']