#!/usr/bin/env python3
"""Standalone sanity check for a pickled CTC phoneme-segmentation dataset.

Verifies that every phoneme segment in the dataset has a valid temporal
ordering (end_time >= start_time), using only the standard library.
"""
import pickle
from pathlib import Path

# Default location of the dataset this script was written to validate.
DEFAULT_DATASET = Path("phoneme_segmented_data/ctc_results_20251009_000024.pkl")


def check_dataset(dataset_file=DEFAULT_DATASET):
    """Pure Python check of the dataset without dependencies.

    Args:
        dataset_file: Path (or str) to a pickle file containing successive
            pickled batches (lists) of trial dicts, appended back to back.
            Defaults to the original hard-coded dataset location.

    Returns:
        True when every phoneme segment satisfies end_time >= start_time
        (vacuously True when no segments are present); False when the file
        is missing, unreadable, or contains temporal ordering errors.
    """
    dataset_file = Path(dataset_file)
    if not dataset_file.exists():
        print(f"āŒ File not found: {dataset_file}")
        return False

    print(f"šŸ“ Dataset: {dataset_file}")
    print(f"šŸ“ Size: {dataset_file.stat().st_size / (1024*1024):.1f} MB")

    try:
        # Load the dataset (it's saved as a stream of pickled batches):
        # keep unpickling until EOF.
        # NOTE(security): pickle.load can execute arbitrary code -- only run
        # this script on trusted dataset files.
        all_trials = []
        with open(dataset_file, 'rb') as f:
            while True:
                try:
                    batch = pickle.load(f)
                    all_trials.extend(batch)
                except EOFError:
                    break

        print(f"āœ… Loaded {len(all_trials)} total trials")

        # Analyze alignment_info for temporal ordering.
        total_segments = 0
        temporal_errors = 0
        sample_outputs = []  # first few offending segments, for the report

        for i, trial in enumerate(all_trials):
            # Each alignment_info entry is assumed to be a
            # (phoneme, start_time, end_time, confidence) tuple -- TODO
            # confirm against the writer of this dataset.
            for phoneme, start_time, end_time, confidence in trial.get('alignment_info', []):
                total_segments += 1
                # A segment that ends before it starts is an ordering error.
                if end_time < start_time:
                    temporal_errors += 1
                    if len(sample_outputs) < 5:  # Collect first 5 errors as examples
                        sample_outputs.append(f"Trial {i}: '{phoneme}' {start_time}->{end_time}")

        print(f"šŸ“Š Analysis Results:")
        print(f"   Total phoneme segments: {total_segments}")
        print(f"   Temporal ordering errors: {temporal_errors}")

        if temporal_errors > 0:
            # total_segments > 0 is guaranteed here, so no division by zero.
            error_rate = (temporal_errors / total_segments) * 100
            print(f"   Error rate: {error_rate:.3f}%")
            print(f"   Sample errors:")
            for error in sample_outputs:
                print(f"      {error}")

        return temporal_errors == 0

    except Exception as e:
        # Broad catch is deliberate: this is a diagnostic script, and any
        # failure to read or parse the file should be reported as a failed
        # check rather than a traceback.
        print(f"āŒ Error loading dataset: {e}")
        return False


if __name__ == "__main__":
    success = check_dataset()
    if success:
        print(f"\nšŸŽ‰ VERIFICATION SUCCESS!")
        print(f"All phoneme segments have proper temporal ordering (end_time >= start_time)")
    else:
        print(f"\nāŒ VERIFICATION FAILED!")
        print(f"Some temporal ordering issues detected")