From eefff1ce5e5d0ea3be29f1ebd3dc16cb785ab33a Mon Sep 17 00:00:00 2001
From: Zchen <161216199+ZH-CEN@users.noreply.github.com>
Date: Thu, 16 Oct 2025 21:40:43 +0800
Subject: [PATCH] fix

---
 model_training_nnn_tpu/trainer_tf.py | 121 +++++++++++++++++++--------
 1 file changed, 85 insertions(+), 36 deletions(-)

diff --git a/model_training_nnn_tpu/trainer_tf.py b/model_training_nnn_tpu/trainer_tf.py
index 3e27fa4..970d858 100644
--- a/model_training_nnn_tpu/trainer_tf.py
+++ b/model_training_nnn_tpu/trainer_tf.py
@@ -100,27 +100,40 @@ class BrainToTextDecoderTrainerTF:
         # Force optimizer to build its internal state within strategy scope
         # This prevents the 'NoneType' strategy error during first apply_gradients
         try:
-            # Check if strategy is properly initialized before applying gradients
-            if hasattr(self.strategy, 'merge_call') and callable(getattr(self.strategy, 'merge_call')):
-                print("✅ Strategy has merge_call, building optimizer properly...")
+            print("✅ Building optimizer with complete state initialization...")
 
-                # Build optimizer by explicitly calling build method
-                self.optimizer.build(self.model.trainable_variables)
-                print("✅ Optimizer built with model variables")
+            # First, explicitly build the optimizer with model variables
+            print(f"Building optimizer with {len(self.model.trainable_variables)} variables")
+            self.optimizer.build(self.model.trainable_variables)
+            print("✅ Optimizer built with model variables")
 
-                # Test with dummy gradients to ensure everything works
-                dummy_grads = [tf.zeros_like(w) for w in self.model.trainable_variables]
-                self.optimizer.apply_gradients(zip(dummy_grads, self.model.trainable_variables))
-                print("✅ Optimizer state pre-built successfully with TPU strategy")
+            # Create dummy gradients and variables for full state initialization
+            dummy_grads = [tf.zeros_like(var) for var in self.model.trainable_variables]
+            print(f"Created {len(dummy_grads)} dummy gradients")
+
+            # Apply dummy gradients to fully initialize optimizer state
+            # This ensures all optimizer variables are created within the strategy scope
+            self.optimizer.apply_gradients(zip(dummy_grads, self.model.trainable_variables))
+            print("✅ Optimizer state fully initialized with dummy gradients")
+
+            # Verify optimizer is properly built
+            print(f"Optimizer iterations: {self.optimizer.iterations}")
+            print(f"Optimizer built: {self.optimizer.built}")
+
+            # Print optimizer variable names for debugging
+            if hasattr(self.optimizer, 'variables') and self.optimizer.variables:
+                print(f"Optimizer has {len(self.optimizer.variables)} internal variables")
             else:
-                # Fallback: just build optimizer variables without applying gradients
-                print("⚠️ Strategy not fully initialized, using fallback optimizer build")
-                # Force build the optimizer with the model variables
-                self.optimizer.build(self.model.trainable_variables)
-                print("✅ Optimizer built in fallback mode")
+                print("⚠️ Optimizer has no internal variables - this might cause issues")
+
+            print("✅ Optimizer pre-build completed successfully")
+
         except Exception as e:
-            print(f"⚠️ Warning: Could not pre-build optimizer state: {e}")
-            print("✅ Continuing without optimizer pre-build - optimizer will build during first training step")
+            print(f"❌ CRITICAL: Could not pre-build optimizer state: {e}")
+            print(f"Error type: {type(e).__name__}")
+            import traceback
+            print(f"Full traceback: {traceback.format_exc()}")
+            raise RuntimeError(f"Optimizer pre-build failed: {e}") from e
 
         print("📅 Setting up learning rate scheduler...")
         self.lr_scheduler = self._create_lr_scheduler()
@@ -422,15 +435,31 @@ class BrainToTextDecoderTrainerTF:
         # Note: TensorFlow doesn't have the same parameter group functionality as PyTorch
         # We'll use a single optimizer and handle different learning rates in the scheduler
 
-        # Create optimizer within strategy scope to ensure proper initialization
         print(f"Creating optimizer with strategy: {type(self.strategy).__name__}")
-        optimizer = tf.keras.optimizers.AdamW(
-            learning_rate=self.args['lr_max'],
-            beta_1=self.args['beta0'],
-            beta_2=self.args['beta1'],
-            epsilon=self.args['epsilon'],
-            weight_decay=self.args['weight_decay']
-        )
+
+        # For TPU training, we need to be more explicit about optimizer configuration
+        # to avoid strategy context issues
+        if isinstance(self.strategy, tf.distribute.TPUStrategy):
+            print("Using TPU-optimized optimizer configuration")
+            # TPU-specific optimizer configuration
+            optimizer = tf.keras.optimizers.AdamW(
+                learning_rate=self.args['lr_max'],
+                beta_1=self.args['beta0'],
+                beta_2=self.args['beta1'],
+                epsilon=self.args['epsilon'],
+                weight_decay=self.args['weight_decay'],
+                # TPU-specific settings
+                global_clipnorm=self.args.get('grad_norm_clip_value', 0.0) if self.args.get('grad_norm_clip_value', 0.0) > 0 else None
+            )
+        else:
+            print("Using standard optimizer configuration")
+            optimizer = tf.keras.optimizers.AdamW(
+                learning_rate=self.args['lr_max'],
+                beta_1=self.args['beta0'],
+                beta_2=self.args['beta1'],
+                epsilon=self.args['epsilon'],
+                weight_decay=self.args['weight_decay']
+            )
 
         return optimizer
 
@@ -475,7 +504,6 @@ class BrainToTextDecoderTrainerTF:
         total_params = sum([tf.size(w).numpy() for w in self.model.trainable_weights])
         self.logger.info(f"Model has {total_params:,} trainable parameters")
 
-    @tf.function
     def _train_step(self, batch, step):
         """Single training step with gradient tape"""
         features = batch['input_features']
@@ -575,22 +603,43 @@ class BrainToTextDecoderTrainerTF:
 
         # Apply gradients (only for variables that have gradients)
         if len(filtered_gradients) > 0:
-            # Ensure we're in the strategy scope when applying gradients
-            # This prevents the 'NoneType' extended attribute error
+            # Apply gradients with comprehensive error handling
+            # The optimizer should already be built and have all necessary variables
             try:
+                # Check if optimizer is properly built before applying gradients
+                if not self.optimizer.built:
+                    print("WARNING: Optimizer not built, building now...")
+                    # This should not happen if pre-build worked correctly
+                    self.optimizer.build(filtered_variables)
+
+                # Apply gradients - this should work since optimizer is pre-built
                 self.optimizer.apply_gradients(zip(filtered_gradients, filtered_variables))
+
             except AttributeError as e:
-                if "'NoneType' object has no attribute 'extended'" in str(e):
-                    # Strategy context was lost, this should not happen in a @tf.function
-                    tf.print(f"ERROR: Strategy context lost during gradient application: {e}")
-                    tf.print("This indicates a serious issue with the distributed training setup")
-                    raise RuntimeError(f"Strategy context lost during training: {e}")
-                else:
-                    raise
+                print("CRITICAL ERROR in gradient application:")
+                print(f"Error: {e}")
+                print("This indicates the optimizer lost its strategy context")
+                print(f"Optimizer built: {self.optimizer.built}")
+                print(f"Number of gradients: {len(filtered_gradients)}")
+                print(f"Number of variables: {len(filtered_variables)}")
+
+                # Check current strategy
+                current_strategy = tf.distribute.get_strategy()
+                print(f"Current strategy: {type(current_strategy).__name__}")
+                print(f"Training strategy: {type(self.strategy).__name__}")
+
+                # Re-raise with more context
+                raise RuntimeError(f"Gradient application failed - optimizer strategy context lost: {e}")
+
+            except Exception as e:
+                # Catch any other errors during gradient application
+                print("Unexpected error during gradient application:")
+                print(f"Error type: {type(e).__name__}")
+                print(f"Error message: {e}")
+                raise
 
         return loss, grad_norm
 
-    @tf.function
     def _validation_step(self, batch):
         """Single validation step"""
         features = batch['input_features']
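
-- 
Editor's notes (outside the diff; `git am` stops at the end of the patch,
so nothing below affects applying it):

The first hunk relies on a "dummy zero-gradient" pre-build: the optimizer
is built and given one all-zero update inside the distribution strategy
scope, so every optimizer slot variable exists before the first real
training step and apply_gradients never has to create state in a context
where the strategy is unavailable. A minimal standalone sketch of the
pattern (the toy model and shapes are illustrative, not from the trainer):

    import tensorflow as tf

    # Toy stand-in for the trainer's model; in trainer_tf.py this all
    # happens inside the TPUStrategy scope.
    model = tf.keras.Sequential([tf.keras.layers.Dense(8)])
    model.build((None, 4))

    optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-3,
                                          weight_decay=0.0)

    # Step 1: create the slot variables (Adam's first/second moments).
    optimizer.build(model.trainable_variables)

    # Step 2: one all-zero apply_gradients call creates any remaining
    # internal state before the first real step.
    dummy_grads = [tf.zeros_like(v) for v in model.trainable_variables]
    optimizer.apply_gradients(zip(dummy_grads, model.trainable_variables))

    print(optimizer.built)       # True
    print(optimizer.iterations)  # 1

One caveat worth noting: with AdamW's decoupled weight decay set above
zero (as it is in the trainer), the zero-gradient apply is not a perfect
no-op; it still shrinks the weights by lr * weight_decay and advances
optimizer.iterations by one.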
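The second hunk moves gradient clipping into the optimizer itself via
global_clipnorm on the TPU path. A short sketch of that construction,
using a hypothetical args dict that mirrors the trainer's self.args keys:

    import tensorflow as tf

    args = {'lr_max': 1e-3, 'beta0': 0.9, 'beta1': 0.999,
            'epsilon': 1e-7, 'weight_decay': 0.01,
            'grad_norm_clip_value': 10.0}

    # 0.0 is treated as "clipping disabled", so it is mapped to None
    # rather than clipping every gradient's global norm to zero.
    clip = args.get('grad_norm_clip_value', 0.0)
    optimizer = tf.keras.optimizers.AdamW(
        learning_rate=args['lr_max'],
        beta_1=args['beta0'],
        beta_2=args['beta1'],
        epsilon=args['epsilon'],
        weight_decay=args['weight_decay'],
        global_clipnorm=clip if clip > 0 else None,
    )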