tpu not found
@@ -759,20 +759,74 @@ class CTCLoss(keras.losses.Loss):
 # TPU Strategy Helper Functions
 
 def create_tpu_strategy():
     """Create TPU strategy for distributed training on TPU v5e-8"""
+    import os
+
+    print("🔍 Detecting TPU environment...")
+
+    # Check for various TPU environment variables
+    tpu_address = None
+    tpu_name = None
+
+    # Check common TPU environment variables
+    if 'COLAB_TPU_ADDR' in os.environ:
+        tpu_address = os.environ['COLAB_TPU_ADDR']
+        print(f"📍 Found Colab TPU address: {tpu_address}")
+    elif 'TPU_NAME' in os.environ:
+        tpu_name = os.environ['TPU_NAME']
+        print(f"📍 Found TPU name: {tpu_name}")
+    elif 'TPU_WORKER_ID' in os.environ:
+        # Kaggle TPU environment
+        worker_id = os.environ.get('TPU_WORKER_ID', '0')
+        tpu_address = f'grpc://10.0.0.2:8470'  # Default Kaggle TPU address
+        print(f"📍 Kaggle TPU detected, worker ID: {worker_id}, address: {tpu_address}")
+
+    # Print all TPU-related environment variables for debugging
+    print("🔧 TPU environment variables:")
+    tpu_vars = {k: v for k, v in os.environ.items() if 'TPU' in k or 'COLAB' in k}
+    for key, value in tpu_vars.items():
+        print(f"  {key}={value}")
+
     try:
+        # Try different TPU resolver configurations
+        resolver = None
+
+        if tpu_address:
+            print(f"🚀 Attempting TPU connection with address: {tpu_address}")
+            resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
+        elif tpu_name:
+            print(f"🚀 Attempting TPU connection with name: {tpu_name}")
+            resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_name)
+        else:
+            # Try auto-detection
+            print("🚀 Attempting TPU auto-detection...")
+            resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
+
         # Initialize TPU
-        resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
+        print("⚡ Connecting to TPU cluster...")
         tf.config.experimental_connect_to_cluster(resolver)
+
+        print("🔧 Initializing TPU system...")
         tf.tpu.experimental.initialize_tpu_system(resolver)
+
         # Create TPU strategy
+        print("🎯 Creating TPU strategy...")
         strategy = tf.distribute.TPUStrategy(resolver)
-        print(f"TPU initialized successfully. Number of replicas: {strategy.num_replicas_in_sync}")
+
+        print(f"✅ TPU initialized successfully!")
+        print(f"🎉 Number of TPU cores: {strategy.num_replicas_in_sync}")
+        print(f"🏃 TPU cluster: {resolver.cluster_spec()}")
         return strategy
-    except ValueError as e:
-        print(f"Failed to initialize TPU: {e}")
-        print("Falling back to default strategy")
+
+    except Exception as e:
+        print(f"❌ Failed to initialize TPU: {e}")
+        print(f"🔍 Error type: {type(e).__name__}")
+
+        # Enhanced error reporting
+        if "Please provide a TPU Name" in str(e):
+            print("💡 Hint: TPU name/address not found in environment variables")
+            print("   Common variables: COLAB_TPU_ADDR, TPU_NAME, TPU_WORKER_ID")
+
+        print("🔄 Falling back to default strategy (CPU/GPU)")
         return tf.distribute.get_strategy()
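For context, here is a minimal sketch of how a helper like this is typically consumed by the surrounding training script. It is not part of this commit: the `build_model()` factory and the batch sizes below are hypothetical placeholders. The key points it illustrates are that the function returns either a TPUStrategy or the default strategy, that variable creation (model and optimizer) must happen inside `strategy.scope()`, and that the global batch size is usually scaled by `strategy.num_replicas_in_sync` (8 cores on a TPU v5e-8 slice).

import tensorflow as tf

# Hypothetical model factory; replace with the project's actual model builder.
def build_model():
    return tf.keras.Sequential([
        tf.keras.layers.Input(shape=(28, 28)),
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(10),
    ])

# Returns a TPUStrategy on TPU, otherwise the default (CPU/GPU) strategy.
strategy = create_tpu_strategy()

# Scale the batch size with the number of replicas so each core sees a fixed per-core batch.
per_replica_batch_size = 32
global_batch_size = per_replica_batch_size * strategy.num_replicas_in_sync

with strategy.scope():
    # Weights and optimizer slots must be created under the strategy scope.
    model = build_model()
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    )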