76 lines
2.4 KiB
Python
76 lines
2.4 KiB
Python
![]() |
#!/usr/bin/env python3
|
||
|
|
||
|
import pickle
|
||
|
from pathlib import Path
|
||
|
|
||
|
def _load_trial_batches(dataset_path):
    """Return all trials from a file holding consecutive pickled batches."""
    trials = []
    # NOTE(security): pickle.load can execute arbitrary code while
    # unpickling. Only point this at dataset files you trust.
    with open(dataset_path, 'rb') as f:
        while True:
            try:
                batch = pickle.load(f)
            except EOFError:
                # No more pickled objects in the stream.
                break
            trials.extend(batch)
    return trials


def _scan_temporal_errors(trials, max_samples=5):
    """Count segments and those with end_time < start_time, with samples."""
    total_segments = 0
    temporal_errors = 0
    sample_outputs = []

    for i, trial in enumerate(trials):
        if 'alignment_info' not in trial:
            continue

        for phoneme, start_time, end_time, confidence in trial['alignment_info']:
            total_segments += 1

            if end_time < start_time:
                temporal_errors += 1
                # Keep only the first few offenders as examples.
                if len(sample_outputs) < max_samples:
                    sample_outputs.append(
                        f"Trial {i}: '{phoneme}' {start_time}->{end_time}"
                    )

    return total_segments, temporal_errors, sample_outputs


def check_dataset(dataset_file="phoneme_segmented_data/ctc_results_20251009_000024.pkl"):
    """Check that no phoneme segment in a pickled dataset ends before it starts.

    Uses only the standard library. The file is read as a stream of
    consecutive pickled objects; each one is treated as a batch whose items
    are trials. Trials without an ``'alignment_info'`` key are skipped. Each
    ``alignment_info`` entry is unpacked as
    ``(phoneme, start_time, end_time, confidence)``.

    Progress and results are printed to stdout.

    Args:
        dataset_file: Path (string or path-like) of the pickle file to
            check. Defaults to the results file this script was written for.

    Returns:
        ``True`` if no segment has ``end_time < start_time`` (this includes
        a file with no segments at all). ``False`` if the file does not
        exist, if loading or scanning raises, or if at least one segment
        violates the ordering.
    """
    dataset_file = Path(dataset_file)

    if not dataset_file.exists():
        print(f"❌ File not found: {dataset_file}")
        return False

    print(f"📁 Dataset: {dataset_file}")
    print(f"📏 Size: {dataset_file.stat().st_size / (1024*1024):.1f} MB")

    try:
        # Load the dataset (it's saved as batches)
        all_trials = _load_trial_batches(dataset_file)
        print(f"✅ Loaded {len(all_trials)} total trials")

        # Analyze alignment_info for temporal ordering
        total_segments, temporal_errors, sample_outputs = _scan_temporal_errors(
            all_trials
        )

        print("📊 Analysis Results:")
        print(f" Total phoneme segments: {total_segments}")
        print(f" Temporal ordering errors: {temporal_errors}")

        if temporal_errors > 0:
            # temporal_errors > 0 implies total_segments > 0, so the
            # division below cannot be by zero.
            error_rate = (temporal_errors / total_segments) * 100
            print(f" Error rate: {error_rate:.3f}%")
            print(" Sample errors:")
            for error in sample_outputs:
                print(f" {error}")

        return temporal_errors == 0

    except Exception as e:
        # Top-level boundary of the check: report the failure and signal it
        # through the return value instead of crashing the script.
        print(f"❌ Error loading dataset: {e}")
        return False
|
||
|
|
||
|
if __name__ == "__main__":
    # Run the check once, pick the matching two-line verdict, then print it.
    if check_dataset():
        verdict_lines = (
            "\n🎉 VERIFICATION SUCCESS!",
            "All phoneme segments have proper temporal ordering (end_time >= start_time)",
        )
    else:
        verdict_lines = (
            "\n❌ VERIFICATION FAILED!",
            "Some temporal ordering issues detected",
        )

    for verdict_line in verdict_lines:
        print(verdict_line)
|