tpu manual dataloader

2025-10-12 20:43:43 +08:00
parent bc015f5efb
commit 7cc9c41b7f
1 changed files with 76 additions and 31 deletions
--- a/model_training_nnn/rnn_trainer.py
+++ b/model_training_nnn/rnn_trainer.py
@@ -49,7 +49,7 @@ class BrainToTextDecoder_Trainer:
        )

        # Set even_batches to False after initialization - required for batch_size=None DataLoaders
-        self.accelerator.even_batches = False
+        # Note: even_batches may not be settable on the Accelerator in all Accelerate versions, so it is handled in the DataLoader config instead

        # Trainer fields
        self.args = args
@@ -280,6 +280,23 @@ class BrainToTextDecoder_Trainer:
                param.requires_grad = False

        # Prepare model, optimizer, scheduler, and dataloaders for distributed training
+        # For TPU, don't prepare DataLoaders with Accelerator to avoid batch_sampler issues
+        use_tpu = self.args.get('use_tpu', False)
+
+        if use_tpu:
+            # On TPU, only prepare model, optimizer, and scheduler
+            (
+                self.model,
+                self.optimizer,
+                self.learning_rate_scheduler,
+            ) = self.accelerator.prepare(
+                self.model,
+                self.optimizer,
+                self.learning_rate_scheduler,
+            )
+            # DataLoaders remain unprepared but will work with our custom configuration
+        else:
+            # Standard GPU/CPU preparation including DataLoaders
            (
                self.model,
                self.optimizer,
@@ -563,7 +580,17 @@ class BrainToTextDecoder_Trainer:
            # Train step
            start_time = time.time()

-            # Data is automatically moved to device by Accelerator
+            # Handle data movement - for TPU, manually move to device since DataLoader wasn't prepared by Accelerator
+            use_tpu = self.args.get('use_tpu', False)
+            if use_tpu:
+                # Manual data movement for TPU since DataLoaders are not prepared by Accelerator
+                features = batch['input_features'].to(self.device)
+                labels = batch['seq_class_ids'].to(self.device)
+                n_time_steps = batch['n_time_steps'].to(self.device)
+                phone_seq_lens = batch['phone_seq_lens'].to(self.device)
+                day_indicies = batch['day_indicies'].to(self.device)
+            else:
+                # For GPU/CPU, data is automatically moved to device by Accelerator
                features = batch['input_features']
                labels = batch['seq_class_ids']
                n_time_steps = batch['n_time_steps']
@@ -732,7 +759,17 @@ class BrainToTextDecoder_Trainer:

        for i, batch in enumerate(loader):

-            # Data is automatically moved to device by Accelerator
+            # Handle data movement - for TPU, manually move to device since DataLoader wasn't prepared by Accelerator
+            use_tpu = self.args.get('use_tpu', False)
+            if use_tpu:
+                # Manual data movement for TPU since DataLoaders are not prepared by Accelerator
+                features = batch['input_features'].to(self.device)
+                labels = batch['seq_class_ids'].to(self.device)
+                n_time_steps = batch['n_time_steps'].to(self.device)
+                phone_seq_lens = batch['phone_seq_lens'].to(self.device)
+                day_indicies = batch['day_indicies'].to(self.device)
+            else:
+                # For GPU/CPU, data is automatically moved to device by Accelerator
                features = batch['input_features']
                labels = batch['seq_class_ids']
                n_time_steps = batch['n_time_steps']
@@ -838,7 +875,15 @@ class BrainToTextDecoder_Trainer:
        '''
        self.model.eval()

-        # Data is automatically moved to device by Accelerator
+        # Handle data movement - for TPU, manually move to device since DataLoader wasn't prepared by Accelerator
+        use_tpu = self.args.get('use_tpu', False)
+        if use_tpu:
+            # Manual data movement for TPU since DataLoaders are not prepared by Accelerator
+            features = batch['input_features'].to(self.device)
+            day_indicies = batch['day_indicies'].to(self.device)
+            n_time_steps = batch['n_time_steps'].to(self.device)
+        else:
+            # For GPU/CPU, data is automatically moved to device by Accelerator
            features = batch['input_features']
            day_indicies = batch['day_indicies']
            n_time_steps = batch['n_time_steps']