Files
b2txt25/data_analyse/pure_python_check.py

76 lines
2.4 KiB
Python
Raw Normal View History

2025-10-12 09:11:32 +08:00
#!/usr/bin/env python3
import pickle
from pathlib import Path
def check_dataset(dataset_path="phoneme_segmented_data/ctc_results_20251009_000024.pkl"):
    """Pure Python check of the dataset without dependencies.

    Reads a pickle file written as successive batches of trial dicts and
    verifies that every phoneme segment in each trial's ``alignment_info``
    is temporally ordered (end_time >= start_time).

    Args:
        dataset_path: Location of the pickled CTC results file. Defaults to
            the original hard-coded path so existing callers are unchanged.

    Returns:
        True when the file exists, loads cleanly, and no segment has
        end_time < start_time; False otherwise.
    """
    dataset_file = Path(dataset_path)
    if not dataset_file.exists():
        print(f"❌ File not found: {dataset_file}")
        return False
    print(f"📁 Dataset: {dataset_file}")
    print(f"📏 Size: {dataset_file.stat().st_size / (1024*1024):.1f} MB")

    # The file is saved as multiple consecutive pickles (one per batch),
    # so keep calling pickle.load() until EOFError instead of loading once.
    # The try only covers loading: an analysis bug below can no longer be
    # mislabeled as a load failure.
    all_trials = []
    try:
        with open(dataset_file, 'rb') as f:
            while True:
                try:
                    batch = pickle.load(f)
                    all_trials.extend(batch)
                except EOFError:
                    break
    except Exception as e:
        print(f"❌ Error loading dataset: {e}")
        return False
    print(f"✅ Loaded {len(all_trials)} total trials")

    # Analyze alignment_info for temporal ordering.
    total_segments = 0
    temporal_errors = 0
    sample_outputs = []
    for i, trial in enumerate(all_trials):
        if 'alignment_info' not in trial:
            continue
        # Each entry is (phoneme, start_time, end_time, confidence);
        # confidence is not needed for the ordering check.
        for phoneme, start_time, end_time, _confidence in trial['alignment_info']:
            total_segments += 1
            # Check temporal ordering
            if end_time < start_time:
                temporal_errors += 1
                if len(sample_outputs) < 5:  # Collect first 5 errors as examples
                    sample_outputs.append(f"Trial {i}: '{phoneme}' {start_time}->{end_time}")

    print("📊 Analysis Results:")
    print(f" Total phoneme segments: {total_segments}")
    print(f" Temporal ordering errors: {temporal_errors}")
    if temporal_errors > 0:
        error_rate = (temporal_errors / total_segments) * 100
        print(f" Error rate: {error_rate:.3f}%")
        print(" Sample errors:")
        for error in sample_outputs:
            print(f" {error}")
    return temporal_errors == 0
if __name__ == "__main__":
    # Banner depends solely on the boolean verdict from check_dataset().
    if check_dataset():
        print("\n🎉 VERIFICATION SUCCESS!")
        print("All phoneme segments have proper temporal ordering (end_time >= start_time)")
    else:
        print("\n❌ VERIFICATION FAILED!")
        print("Some temporal ordering issues detected")